From 469b3ffaaadb6ab15ddbebc47ac11a0c6fddfda2 Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:04:46 -0700 Subject: [PATCH 001/932] [V1] port xformers backend to v1 (#21342) Signed-off-by: Giancarlo Delfin --- tests/v1/attention/utils.py | 2 + vllm/engine/arg_utils.py | 1 + vllm/platforms/cuda.py | 4 + vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/tree_attn.py | 1 - vllm/v1/attention/backends/xformers.py | 430 ++++++++++++++++++++++++ 6 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/attention/backends/xformers.py diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 78a6509986..e9e574501d 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -128,6 +128,8 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", _Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", + _Backend.XFORMERS_VLLM_V1: + "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", } if backend_name not in backend_map: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5eb9660cd1..3e2f03d56c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1469,6 +1469,7 @@ class EngineArgs: "TORCH_SDPA_VLLM_V1", "FLEX_ATTENTION", "TREE_ATTN", + "XFORMERS_VLLM_V1", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b61b39a927..dd9356e399 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -271,6 +271,7 @@ class CudaPlatformBase(Platform): TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 + XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") @@ -291,6 +292,9 @@ class CudaPlatformBase(Platform): elif selected_backend == _Backend.TREE_ATTN: logger.info_once("Using Tree Attention backend on V1 engine.") return TREE_ATTN_V1 + elif selected_backend == _Backend.XFORMERS_VLLM_V1: + logger.info_once("Using XFormers backend on V1 engine.") + return XFORMERS_V1 from vllm.attention.selector import is_attn_backend_supported diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 61ce868c13..a85b583abc 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -63,6 +63,7 @@ class _Backend(enum.Enum): NO_ATTENTION = enum.auto() FLEX_ATTENTION = enum.auto() TREE_ATTN = enum.auto() + XFORMERS_VLLM_V1 = enum.auto() class PlatformEnum(enum.Enum): diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index a071f0921d..3b53b039f1 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -316,7 +316,6 @@ class TreeAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size diff --git a/vllm/v1/attention/backends/xformers.py 
b/vllm/v1/attention/backends/xformers.py new file mode 100644 index 0000000000..fe732c6017 --- /dev/null +++ b/vllm/v1/attention/backends/xformers.py @@ -0,0 +1,430 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with XFormersAttention.""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.triton_unified_attention import unified_attention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, CommonAttentionMetadata, + reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec + +try: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import ( + AttentionBias, PagedBlockDiagonalCausalWithOffsetPaddedKeysMask) + + XFORMERS_AVAILABLE = True +except ImportError: + XFORMERS_AVAILABLE = False + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch + +from vllm import _custom_ops as ops + +logger = init_logger(__name__) + + +class XFormersAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [ + 32, + 40, + 48, + 56, + 64, + 72, + 80, + 88, + 96, + 104, + 112, + 120, + 128, + 136, + 144, + 152, + 160, + 168, + 176, + 184, + 192, + 200, + 208, + 216, + 224, + 232, + 240, + 248, + 256, + ] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. " + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "XFORMERS_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["XFormersAttentionImpl"]: + return XFormersAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return XFormersAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]: + return XFormersAttentionMetadataBuilder + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +@dataclass +class XFormersAttentionMetadata: + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + num_prefill_tokens: int = 0 + num_decode_tokens: int = 0 + num_prefills: int = 0 + num_decodes: int = 0 + + # Biases for different attention types. 
+ attn_bias: Optional["AttentionBias"] = None + + # Self-attention prefill/decode metadata cache + _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None + _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None + + @property + def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + # Recover cached prefill-phase attention + # metadata structure + return self._cached_prefill_metadata + + q_start_loc = self.query_start_loc[self.num_decodes:] + q_seqlens = torch.diff(q_start_loc) + kv_seqlens = self.seq_lens[self.num_decodes:] + # Construct & cache prefill-phase attention metadata structure + self._cached_prefill_metadata = XFormersAttentionMetadata( + num_actual_tokens=self.num_prefill_tokens, + max_query_len=int(q_seqlens.max().item()), + query_start_loc=q_start_loc - q_start_loc[0], + max_seq_len=int(kv_seqlens.max().item()), + seq_lens=kv_seqlens, + block_table=self.block_table[self.num_decodes:], + slot_mapping=self.slot_mapping[self.num_decode_tokens:], + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + # Recover cached decode-phase attention + # metadata structure + return self._cached_decode_metadata + + q_start_loc = self.query_start_loc + q_seqlens = torch.diff(q_start_loc) + decode_kv_seqlens = self.seq_lens[:self.num_decodes] + # Construct & cache decode-phase attention metadata structure + self._cached_decode_metadata = XFormersAttentionMetadata( + num_actual_tokens=self.num_decode_tokens, + max_query_len=int(q_seqlens[:self.num_decodes].max().item()), + query_start_loc=q_start_loc[:self.num_decodes + 1], + max_seq_len=int(decode_kv_seqlens.max().item()), + seq_lens=decode_kv_seqlens, + block_table=self.block_table[:self.num_decodes], + slot_mapping=self.slot_mapping[:self.num_decode_tokens], + attn_bias=self.attn_bias, + ) + return self._cached_decode_metadata + + +class XFormersAttentionMetadataBuilder( + AttentionMetadataBuilder[XFormersAttentionMetadata]): + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + assert XFORMERS_AVAILABLE + self.kv_cache_spec = kv_cache_spec + self.block_size = kv_cache_spec.block_size + self._num_decodes = 0 + self._num_decode_tokens = 0 + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + return reorder_batch_to_split_decodes_and_prefills(input_batch, + scheduler_output, + decode_threshold=1) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> XFormersAttentionMetadata: + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=1)) + + num_actual_tokens = common_attn_metadata.num_actual_tokens + q_start_loc = common_attn_metadata.query_start_loc + q_seqlens = torch.diff(q_start_loc) + max_query_len = common_attn_metadata.max_query_len + kv_seqlens = common_attn_metadata.seq_lens + max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + bias = None + if num_decodes > 0: + # Construct the decoder bias. 
+ decode_q_seqlens = q_seqlens[:num_decodes] + decode_kv_seqlens = kv_seqlens[:num_decodes] + bias = ( + PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( + q_seqlen=decode_q_seqlens.tolist(), + kv_seqlen=decode_kv_seqlens.tolist(), + page_size=self.block_size, + block_tables=block_table[:num_decodes], + device=block_table.device, + )) + + return XFormersAttentionMetadata( + num_actual_tokens=num_actual_tokens, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + num_prefills=num_prefills, + num_decodes=num_decodes, + max_query_len=max_query_len, + query_start_loc=q_start_loc, + max_seq_len=max_seq_len, + seq_lens=kv_seqlens, + block_table=block_table, + slot_mapping=slot_mapping, + attn_bias=bias, + ) + + +class XFormersAttentionImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") + if alibi_slopes is not None: + raise NotImplementedError( + "XFormers does not support alibi slopes yet.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + if logits_soft_cap is None: + # Setting logits_soft_cap to 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + + XFormersAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "XFormersAttentionImpl.") + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: XFormersAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with XFormers. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert output is not None, "Output tensor must be provided." + + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for XFormersAttentionImpl") + + if attn_metadata is None: + # Profiling run. + return output + + # Cache the input KVs. + key_cache, value_cache = kv_cache.unbind(0) + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. 
+ # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + num_actual_tokens = attn_metadata.num_actual_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + if prefill_meta := attn_metadata.prefill_metadata: + descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, + key.shape[1]) + unified_attention( + q=query[num_decode_tokens:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[num_decode_tokens:num_actual_tokens], + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + seqused_k=prefill_meta.seq_lens, + max_seqlen_k=prefill_meta.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=prefill_meta.block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + + if decode_meta := attn_metadata.decode_metadata: + # Query for decode. KV is not needed because it is already cached. + decode_query = query[:num_decode_tokens] + # Reshape query to [1, B_T, G, H, D]. + q = decode_query.view(1, -1, self.num_kv_heads, + self.num_queries_per_kv, self.head_size) + # Reshape the k and v caches to [1, Bkv_T, G, H, D] + cache_k = key_cache.view(1, -1, self.num_kv_heads, 1, + self.head_size).expand( + 1, + -1, + self.num_kv_heads, + self.num_queries_per_kv, + self.head_size, + ) + cache_v = value_cache.view(1, -1, self.num_kv_heads, 1, + self.head_size).expand( + 1, + -1, + self.num_kv_heads, + self.num_queries_per_kv, + self.head_size, + ) + + attn_bias = decode_meta.attn_bias + output[: + num_decode_tokens] = xops.memory_efficient_attention_forward( + q, + cache_k, + cache_v, + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + ).view(decode_query.shape) + + # Reshape the output tensor. 
+ return output From 59a0b8554bf0e8a9902e14e3d0e564fea38157b6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 6 Aug 2025 01:26:09 +0800 Subject: [PATCH 002/932] [bugfix] fix blackwell deepep installation (#22255) --- tools/ep_kernels/README.md | 10 +++++----- tools/ep_kernels/install_python_libraries.sh | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index 273e0f378e..85e9d2a4f8 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -13,16 +13,16 @@ All scripts accept a positional argument as workspace path for staging the build ## Usage -### Single-node - ```bash -bash install_python_libraries.sh +# for hopper +TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh +# for blackwell +TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh ``` -### Multi-node +Additional step for multi-node deployment: ```bash -bash install_python_libraries.sh sudo bash configure_system_drivers.sh sudo reboot # Reboot is required to load the new driver ``` diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 9d1b2da3b4..e163c83e8b 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -29,6 +29,12 @@ if [ -z "$CUDA_HOME" ]; then exit 1 fi +# assume TORCH_CUDA_ARCH_LIST is set correctly +if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then + echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture." + exit 1 +fi + # disable all features except IBGDA export NVSHMEM_IBGDA_SUPPORT=1 @@ -95,7 +101,7 @@ clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation -PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e . +PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . 
popd # build and install deepep, require pytorch installed From 4b29d2784b3753fd5434cded25cbcf0bce7b7da7 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Tue, 5 Aug 2025 16:54:56 -0700 Subject: [PATCH 003/932] [CI][TPU] Fix docker clean up (#22271) Signed-off-by: Siyuan Liu --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 3 +-- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 1 - .buildkite/scripts/tpu/config_v6e_1.env | 2 +- .buildkite/scripts/tpu/docker_run_bm.sh | 2 -- .buildkite/scripts/tpu/quantized_v6e_1.env | 2 +- 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index d998c1f73b..734a817fd1 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -4,8 +4,7 @@ set -xu remove_docker_container() { - docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; + docker rm -f tpu-test || true; } trap remove_docker_container EXIT diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index e565d4b246..9e7b5a5462 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -5,7 +5,6 @@ set -xu remove_docker_container() { docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; } trap remove_docker_container EXIT diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env index 03ec116f69..c9e3c26571 100644 --- a/.buildkite/scripts/tpu/config_v6e_1.env +++ b/.buildkite/scripts/tpu/config_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8b -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=meta-llama/Llama-3.1-8B-Instruct diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh index 8959877a3c..08e3661180 100755 --- a/.buildkite/scripts/tpu/docker_run_bm.sh +++ b/.buildkite/scripts/tpu/docker_run_bm.sh @@ -12,8 +12,6 @@ source /etc/environment source $ENV_FILE remove_docker_container() { - docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; docker rm -f $CONTAINER_NAME || true; } diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env index bab34b3be3..bd25c80308 100644 --- a/.buildkite/scripts/tpu/quantized_v6e_1.env +++ b/.buildkite/scripts/tpu/quantized_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8bw8a8 -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 From 35509fc5be5d840e84717ff24bba6bdd5cc33d77 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 20:05:40 -0400 Subject: [PATCH 004/932] [Bugfix] Remove faulty test for oot attention backend (#22286) Signed-off-by: mgoin --- tests/plugins_tests/test_platform_plugins.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index ef99c3dadd..1d7e447501 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -4,9 +4,7 @@ import pytest import torch -from vllm.attention.selector import get_attn_backend from vllm.plugins import load_general_plugins -from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -27,14 +25,6 @@ def test_platform_plugins(): f" is 
loaded. The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): - # ignore the backend env variable if it is set - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, "auto", 16, False) - assert backend.get_name() == "Dummy_Backend" - - def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch): # simulate workload by running an example load_general_plugins() From 6a5153043799dde3e22fae11f17c423c765f747b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 22:35:20 -0400 Subject: [PATCH 005/932] [Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278) Signed-off-by: mgoin --- vllm/_custom_ops.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e6f69e2344..92de394180 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -710,23 +710,25 @@ def cutlass_scaled_mm(a: torch.Tensor, scale_b.shape * [128, 128] == b.shape """ assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) - assert bias is None or bias.shape[0] == b.shape[ - 1] and bias.dtype == out_dtype + assert bias is None or bias.numel( + ) == b.shape[1] and bias.dtype == out_dtype - m = a.shape[0] - n = b.shape[1] + # Massage the input to be 2D + target_shape = (*a.shape[:-1], b.shape[1]) + a = a.view(-1, a.shape[-1]) cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) if current_platform.is_rocm() or not cutlass_compatible_b: from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa triton_scaled_mm) - return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + out = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + else: + out = torch.empty((a.shape[0], b.shape[1]), + dtype=out_dtype, + device=a.device) + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) - out = torch.empty((m, n), dtype=out_dtype, device=a.device) - - torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) - - return out + return out.view(*target_shape) def cutlass_scaled_mm_azp(a: torch.Tensor, @@ -746,15 +748,18 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) assert bias is None or bias.numel( ) == b.shape[1] and bias.dtype == out_dtype + + # Massage the input to be 2D + target_shape = (*a.shape[:-1], b.shape[1]) + a = a.view(-1, a.shape[-1]) assert azp is None or azp.numel() == a.shape[0] - m = a.shape[0] - n = b.shape[1] - out = torch.empty((m, n), dtype=out_dtype, device=a.device) - + out = torch.empty((a.shape[0], b.shape[1]), + dtype=out_dtype, + device=a.device) torch.ops._C.cutlass_scaled_mm_azp(out, a, b, scale_a, scale_b, azp_adj, azp, bias) - return out + return out.view(*target_shape) def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: From 8e6c7e873f1a2830ab096d69ee1812b323aef650 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Aug 2025 10:56:22 +0800 Subject: [PATCH 006/932] [Bugfix] Fix MoE BNB version (#22260) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/quantization/bitsandbytes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index a96f3ee5c3..5359189caa 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ 
b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -412,12 +412,12 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.45.3": + if bitsandbytes.__version__ < "0.46.1": raise ImportError("bitsandbytes version is wrong. Please " - "install bitsandbytes>=0.45.3.") + "install bitsandbytes>=0.46.1.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.45.3 via " - "`pip install bitsandbytes>=0.45.3` to use " + raise ImportError("Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err self.topk_indices_dtype = None self.quant_config = quant_config From 7e6544c7978364fcb8178f4ab8b1325e45880aa9 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 5 Aug 2025 22:57:49 -0400 Subject: [PATCH 007/932] [Perf] Parallelize fill_bitmask to accelerate high-throughput guided decoding (#21862) Signed-off-by: Benjamin Chislett --- vllm/v1/structured_output/__init__.py | 125 +++++++++++++----- vllm/v1/structured_output/backend_xgrammar.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 9 +- 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index bd1dd01f90..63604a335d 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations import multiprocessing -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import Future, ThreadPoolExecutor from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig @@ -40,6 +40,17 @@ class StructuredOutputManager: self._grammar_bitmask: Optional[torch.Tensor] = None self._full_mask = torch.tensor(-1, dtype=torch.int32) + max_batch_size = self.vllm_config.scheduler_config.max_num_seqs + self.fill_bitmask_parallel_threshold = 128 + if self.fill_bitmask_parallel_threshold < max_batch_size: + self.fill_bitmask_parallel_batch_size = 16 + # Use: + # - at least 1 CPU + # - at most half the number of CPUs or 8, whichever is less + max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) + self.executor_for_fillmask = ThreadPoolExecutor( + max_workers=max_workers) + if not self.vllm_config.model_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, @@ -120,6 +131,26 @@ class StructuredOutputManager: assert self.backend is not None return self.backend.compile_grammar(request_type, grammar_spec) + def _fill_bitmasks( + self, + batch: list[tuple[StructuredOutputGrammar, int, bool]], + ) -> None: + assert self._grammar_bitmask is not None + for grammar, index, apply_bitmask in batch: + if apply_bitmask and not grammar.is_terminated(): + grammar.fill_bitmask(self._grammar_bitmask, index) + else: + # Note that for thinking support, we will need to + # reset the relevant part of the bitmask for consequent + # requests here. 
+ self._grammar_bitmask[index].fill_(self._full_mask) + + def _async_submit_fill_bitmask( + self, + batch: list[tuple[StructuredOutputGrammar, int, bool]], + ) -> Future: + return self.executor_for_fillmask.submit(self._fill_bitmasks, batch) + def grammar_bitmask( self, requests: dict[str, Request], @@ -146,7 +177,6 @@ class StructuredOutputManager: self.backend.allocate_token_bitmask( max_batch_size * (1 + max_num_spec_tokens)) - bitmask_tensor = self._grammar_bitmask # Generate a batched bitmask for all structured output requests. # When speculative decoding is enabled, we need to include multiple # masks for each request, one for each possible bonus token position. @@ -155,47 +185,61 @@ class StructuredOutputManager: ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1]) - # Note that for thinking support, we will need to - # reset the relevant part of the bitmask for consequent - # request here. - bitmask_tensor[:(len(ordered_seq) * (1 + max_num_spec_tokens))].fill_( - self._full_mask) + # Optimized parallel filling of bitmasks for + # non-spec, large-batch-size cases + if len(ordered_seq) > self.fill_bitmask_parallel_threshold and \ + max_num_spec_tokens == 0: + promises = [] + batch = [] + for req_id, _ in ordered_seq: + request = requests[req_id] + structured_output_request = request.structured_output_request + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None - # NOTE: This outer loop can likely be parallelized to improve - # performance of bitmask generation for large batches. - for req_id, _ in ordered_seq: - request = requests[req_id] - structured_output_request = request.structured_output_request + apply_bitmask = self.should_fill_bitmask(request) + batch.append((structured_output_request.grammar, + cumulative_index, apply_bitmask)) + if len(batch) == self.fill_bitmask_parallel_batch_size: + promises.append(self._async_submit_fill_bitmask(batch)) + batch = [] - if TYPE_CHECKING: - assert structured_output_request is not None - assert structured_output_request.grammar is not None - apply_bitmask: bool = True - if self.reasoner is not None: - if structured_output_request.reasoning_ended is None: - structured_output_request.reasoning_ended = \ - self.reasoner.is_reasoning_end(request.prompt_token_ids) - apply_bitmask = structured_output_request.reasoning_ended + cumulative_index += 1 + if batch: + promises.append(self._async_submit_fill_bitmask(batch)) - state_advancements = 0 - req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] - for i, token in enumerate(req_tokens): - if apply_bitmask and not \ - structured_output_request.grammar.is_terminated(): - structured_output_request.grammar.fill_bitmask( - bitmask_tensor, cumulative_index) - if token is not None: - # In order to generate the correct bitmask for each - # position in the speculative sequence, we advance - # the FSM state for each speculative token and rollback - # to restore the previous state when we are finished. + # Wait for all bitmask filling tasks to complete. 
+ for promise in promises: + promise.result() + else: + # Fallback to serial filling of bitmasks for small-batch-size cases + for req_id, _ in ordered_seq: + request = requests[req_id] + structured_output_request = request.structured_output_request + + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None + apply_bitmask = self.should_fill_bitmask(request) + + state_advancements = 0 + req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + for i, token in enumerate(req_tokens + [None]): + self._fill_bitmasks([(structured_output_request.grammar, + cumulative_index, apply_bitmask)]) + + if apply_bitmask and token is not None and \ + not structured_output_request.grammar.is_terminated(): assert structured_output_request.grammar.accept_tokens( req_id, [token]) state_advancements += 1 - cumulative_index += 1 - if state_advancements > 0: - structured_output_request.grammar.rollback(state_advancements) + cumulative_index += 1 + if state_advancements > 0: + structured_output_request.grammar.rollback( + state_advancements) + bitmask_tensor = self._grammar_bitmask if cumulative_index < bitmask_tensor.shape[0]: bitmask_tensor = bitmask_tensor[:cumulative_index] @@ -204,6 +248,15 @@ class StructuredOutputManager: # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() + def should_fill_bitmask(self, request: Request) -> bool: + if self.reasoner is not None: + assert request.structured_output_request is not None + if request.structured_output_request.reasoning_ended is None: + request.structured_output_request.reasoning_ended = \ + self.reasoner.is_reasoning_end(request.prompt_token_ids) + return request.structured_output_request.reasoning_ended + return True + def should_advance(self, request: Request) -> bool: if not request.use_structured_output: return False diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 88544565e5..5e00f63804 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -148,6 +148,7 @@ class XgrammarGrammar(StructuredOutputGrammar): repr=False, hash=False, init=False) + _is_terminated: bool = field(default=False, repr=False, hash=False) def accept_tokens(self, request_id: str, tokens: list[int]) -> bool: """Accepts a list of tokens and advances the FSM. @@ -155,6 +156,8 @@ class XgrammarGrammar(StructuredOutputGrammar): Returns True if the FSM was advanced successfully. Returns False if the FSM failed to advance. """ + if self._is_terminated: + return False for token in tokens: if not self.matcher.accept_token(token): logger.error( @@ -162,6 +165,7 @@ class XgrammarGrammar(StructuredOutputGrammar): "for tokens %s. 
Please file an issue.", request_id, token) return False self.num_processed_tokens += 1 + self._is_terminated = self.matcher.is_terminated() return True def validate_tokens(self, tokens: list[int]) -> list[int]: @@ -184,12 +188,13 @@ class XgrammarGrammar(StructuredOutputGrammar): def rollback(self, num_tokens: int) -> None: self.matcher.rollback(num_tokens) self.num_processed_tokens -= num_tokens + self._is_terminated = self.matcher.is_terminated() def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None: self.matcher.fill_next_token_bitmask(bitmask, idx) def is_terminated(self) -> bool: - return self.matcher.is_terminated() + return self._is_terminated def reset(self): self.num_processed_tokens = 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 85976fc1c8..549f21af79 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1324,9 +1324,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask + # If the grammar bitmask and the logits have the same shape + # we don't need to pass indices to the kernel, + # since the bitmask is already aligned with the logits. + skip_out_indices = grammar_bitmask.shape[0] == logits.shape[0] + # Serialization of np.ndarray is much more efficient than a tensor, # so we receive it in that format. - grammar_bitmask = torch.from_numpy(grammar_bitmask) + grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() # Force use of the torch.compile implementation from xgrammar to work # around issues with the Triton kernel in concurrent structured output @@ -1334,7 +1339,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): xgr_torch_compile.apply_token_bitmask_inplace_torch_compile( logits, grammar_bitmask.to(self.device, non_blocking=True), - indices=out_indices, + indices=out_indices if not skip_out_indices else None, ) def sync_and_slice_intermediate_tensors( From 302962e806e9820643ae25987e8e38ed035e05d3 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:35:32 -0700 Subject: [PATCH 008/932] [Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275) Signed-off-by: Rui Qiao --- vllm/v1/engine/utils.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index f39aa40593..770aa7d9dc 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -297,10 +297,10 @@ class CoreEngineActorManager: local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local - nodes = sorted(list_nodes(), + nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]), key=lambda node: node.node_ip != dp_master_ip) assert nodes[0].node_ip == dp_master_ip, ( - "The first node must be the head node") + "The head node is missing or dead") assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( "There can only be one head node") @@ -312,6 +312,8 @@ class CoreEngineActorManager: for node in nodes: node_ip = node.node_ip node_resources = available_resources[node.node_id] + if "GPU" not in node_resources: + continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes @@ -346,6 +348,13 @@ class CoreEngineActorManager: ) placement_groups.append(pg) local_dp_ranks.append(i) + if len(placement_groups) < num_pg_to_create: + raise 
ValueError( + f"Not enough resources to allocate {num_pg_to_create} " + "placement groups, only created " + f"{len(placement_groups)} placement groups. " + "Available resources: " + f"{available_resources}") return placement_groups, local_dp_ranks @staticmethod From 5d5d419ca6aa55034eef0144f24e66789b486cb5 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 5 Aug 2025 23:39:32 -0400 Subject: [PATCH 009/932] [Bugfix][CI/Build][ROCm] Make sure to use the headers from the build folder on ROCm (#22264) Signed-off-by: Gregory Shtrasberg --- cmake/utils.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 621179a701..9c0ed1d095 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -467,6 +467,12 @@ function (define_gpu_extension_target GPU_MOD_NAME) if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder + target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc + ${GPU_INCLUDE_DIRECTORIES}) + else() + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) endif() if (GPU_ARCHITECTURES) @@ -482,8 +488,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) target_compile_definitions(${GPU_MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") - target_include_directories(${GPU_MOD_NAME} PRIVATE csrc - ${GPU_INCLUDE_DIRECTORIES}) target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) From e3c876dca357711705822a7539eddca05ee0911f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 21:36:21 -0700 Subject: [PATCH 010/932] Upgrade FA3 for attention sink (#22313) Signed-off-by: Woosuk Kwon --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index ef45a5fbeb..4eb4b464a2 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4 + GIT_TAG b99f8c821771fd11feb66d5c89661e9858fde359 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From dd16bdc7981349edc44900c1c614e09b2faa712e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 21:43:21 -0700 Subject: [PATCH 011/932] Increase openai-python version (#22316) Signed-off-by: Woosuk Kwon --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 6b57a3d2f1..c5eb6dab95 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
aiohttp -openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) +openai >= 1.98.0 # For Responses API with reasoning content pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From 6e20924350e3fed375bc63d55166a303b6f0828a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 22:37:21 -0700 Subject: [PATCH 012/932] Add attention sink in attention backends (#22320) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- .../ops/chunked_prefill_paged_decode.py | 33 ++++++-- vllm/attention/ops/prefix_prefill.py | 18 ++++- .../attention/ops/triton_unified_attention.py | 30 +++++++- vllm/envs.py | 19 ++++- vllm/v1/attention/backends/flash_attn.py | 10 +++ vllm/v1/attention/backends/triton_attn.py | 75 ++++++++++++++----- vllm/v1/attention/backends/utils.py | 36 ++++++--- 7 files changed, 176 insertions(+), 45 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 4f839348e5..08bfcc974c 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -28,6 +28,7 @@ def kernel_paged_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -95,7 +96,17 @@ def kernel_paged_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + M = tl.full([num_queries_per_kv_padded], + float("-inf"), + dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_head_idx, + mask=head_mask, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -223,6 +234,8 @@ def chunked_prefill_paged_decode( alibi_slopes=None, sliding_window=None, sm_scale=None, + # Optional tensor for sinks + sinks=None, ): if sm_scale is None: @@ -253,6 +266,7 @@ def chunked_prefill_paged_decode( sliding_window=sliding_window, sm_scale=sm_scale, skip_decode=True, + sinks=sinks, ) block_size = value_cache.shape[3] @@ -281,11 +295,17 @@ def chunked_prefill_paged_decode( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) - use_custom = use_rocm_custom_paged_attention(query.dtype, head_size, - block_size, - num_queries_per_kv, - max_seq_len, sliding_window, - kv_cache_dtype, alibi_slopes) + use_custom = use_rocm_custom_paged_attention( + query.dtype, + head_size, + block_size, + num_queries_per_kv, + max_seq_len, + sliding_window, + kv_cache_dtype, + alibi_slopes, + sinks, + ) if use_custom: _PARTITION_SIZE_ROCM = 256 max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // @@ -334,6 +354,7 @@ def chunked_prefill_paged_decode( query_ptr=query, key_cache_ptr=key_cache, value_cache_ptr=value_cache, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seq_lens, 
alibi_slopes_ptr=alibi_slopes, diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 13bef96722..64c9033797 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -38,6 +38,7 @@ def _fwd_kernel(Q, V, K_cache, V_cache, + sink_ptr, B_Loc, sm_scale, k_scale, @@ -126,7 +127,15 @@ def _fwd_kernel(Q, other=0.0) # [M,D] # initialize pointer to m and l - m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + m_i = tl.load( + sink_ptr + tl.full([BLOCK_M], cur_head, dtype=tl.int64), + mask=(offs_m < cur_batch_query_len), + other=float("-inf"), + ).to(dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) # [M,D] @@ -732,7 +741,8 @@ def context_attention_fwd(q, alibi_slopes=None, sliding_window=None, sm_scale=None, - skip_decode=False): + skip_decode=False, + sinks=None): q_dtype_is_f32 = q.dtype is torch.float32 @@ -781,6 +791,7 @@ def context_attention_fwd(q, sliding_window = 0 if alibi_slopes is not None: + assert sinks is None, "Sinks arg is not supported with alibi" # need to reduce num. blocks when using fp32 # due to increased use of GPU shared memory # if q.dtype is torch.float32: @@ -843,7 +854,7 @@ def context_attention_fwd(q, max_seq_len = 0 if max_seq_len is None else max_seq_len extra_kargs = {} if current_platform.is_rocm(): - extra_kargs = {"kpack": 2, "waves_per_eu": 2} + extra_kargs = {"kpack": 1, "waves_per_eu": 2} grid = lambda META: (batch, head, triton.cdiv(max_input_len, META["BLOCK_M"])) @@ -853,6 +864,7 @@ def context_attention_fwd(q, v, k_cache, v_cache, + sinks, b_loc, sm_scale, k_scale, diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 0fdba569f9..ba4299a277 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -52,6 +52,7 @@ def kernel_unified_attention_2d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -131,7 +132,15 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -292,6 +301,7 @@ def kernel_unified_attention_3d( query_ptr, # [num_tokens, num_query_heads, head_size] key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] @@ -383,7 +393,15 @@ def kernel_unified_attention_3d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if sink_ptr is None or segm_idx != 0: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + 
else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -627,6 +645,8 @@ def unified_attention( v_descale, alibi_slopes=None, qq_bias=None, + # Optional tensor for sinks + sinks=None, ): assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" @@ -635,6 +655,10 @@ def unified_attention( assert q.element_size() >= 2 or block_size >= 32, \ "Block size must be at least 32 for fp8" + if sinks is not None: + assert sinks.shape[0] == q.shape[1], \ + "Sinks must be num_query_heads size" + use_alibi_slopes = alibi_slopes is not None use_qq_bias = qq_bias is not None @@ -669,6 +693,7 @@ def unified_attention( query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, @@ -741,6 +766,7 @@ def unified_attention( query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, + sink_ptr=sinks, block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, diff --git a/vllm/envs.py b/vllm/envs.py index e28e9658e5..f8a7197dd1 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = True VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False + VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None @@ -151,6 +152,8 @@ if TYPE_CHECKING: VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False + VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False + VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False def get_default_cache_root(): @@ -326,6 +329,12 @@ environment_variables: dict[str, Callable[[], Any]] = { (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in ("true", "1")), + # Use AITER triton unified attention for V1 attention + "VLLM_USE_AITER_UNIFIED_ATTENTION": + lambda: + (os.getenv("VLLM_USE_AITER_UNIFIED_ATTENTION", "False").lower() in + ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid # when using the flash-attention backend. "VLLM_FLASH_ATTN_VERSION": @@ -1022,9 +1031,13 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 1, use the TRTLLM Attention backend in flashinfer. - "VLLM_USE_TRTLLM_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + # If set to 1, use the TRTLLM Context Attention backend in flashinfer. + "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": + lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_CONTEXT_ATTENTION", "0"))), + + # If set to 1, use the TRTLLM Decode Attention backend in flashinfer. + "VLLM_USE_TRTLLM_DECODE_ATTENTION": + lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", "0"))), # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. 
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index f086bab255..95ba56b359 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -373,6 +373,7 @@ class FlashAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -410,6 +411,14 @@ class FlashAttentionImpl(AttentionImpl): raise NotImplementedError( "FlashAttention does not support fp8 kv-cache on this device.") + self.sinks = sinks + if self.sinks is not None: + assert self.vllm_flash_attn_version == 3, ( + "Sinks are only supported in FlashAttention 3") + assert self.sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + "heads in the layer") + def forward( self, layer: torch.nn.Module, @@ -534,6 +543,7 @@ class FlashAttentionImpl(AttentionImpl): k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), num_splits=attn_metadata.max_num_splits, + s_aux=self.sinks, ) return output diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 942cb95eef..c33afbfebc 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with PagedAttention and Triton prefix prefill.""" from dataclasses import dataclass +from functools import cache from typing import ClassVar, Optional import torch @@ -13,7 +14,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.attention.ops.chunked_prefill_paged_decode import ( chunked_prefill_paged_decode) from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform @@ -193,6 +193,15 @@ class TritonAttentionBackend(AttentionBackend): return TritonAttentionMetadataBuilder +@cache +def use_aiter_unified_attention() -> bool: + """Check if aiter unified attention should be used.""" + # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set + # to 1 as default + return envs.VLLM_ROCM_USE_AITER \ + and envs.VLLM_USE_AITER_UNIFIED_ATTENTION + + class TritonAttentionImpl(AttentionImpl): def __init__( @@ -207,6 +216,7 @@ class TritonAttentionImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -240,6 +250,29 @@ class TritonAttentionImpl(AttentionImpl): self.force_prefill_decode_attn = \ envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + if not self.force_prefill_decode_attn: + # If not using prefill decode attention, we use the Triton + # unified attention implementation. 
+ if use_aiter_unified_attention(): + logger.info_once( + "Using aiter unified attention for TritonAttentionImpl") + from aiter.ops.triton.unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + else: + logger.info_once( + "Using vllm unified attention for TritonAttentionImpl") + from vllm.attention.ops.triton_unified_attention import ( + unified_attention) + self.unified_attention = unified_attention + + self.sinks = sinks + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Sinks shape: {sinks.shape}, " + f"num_heads: {num_heads}.") + def forward( self, layer: torch.nn.Module, @@ -342,28 +375,31 @@ class TritonAttentionImpl(AttentionImpl): if use_prefill_decode_attn: # Compute attention and update output up to `num_actual_tokens`. - chunked_prefill_paged_decode(query=query[:num_actual_tokens], - key=key[:num_actual_tokens], - value=value[:num_actual_tokens], - output=output[:num_actual_tokens], - kv_cache_dtype=self.kv_cache_dtype, - key_cache=key_cache, - value_cache=value_cache, - block_table=block_table, - query_start_loc=cu_seqlens_q, - seq_lens=seqused_k, - max_seq_len=max_seqlen_k, - max_query_len=max_seqlen_q, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - alibi_slopes=self.alibi_slopes, - sliding_window=self.sliding_window[0], - sm_scale=self.scale) + chunked_prefill_paged_decode( + query=query[:num_actual_tokens], + key=key[:num_actual_tokens], + value=value[:num_actual_tokens], + output=output[:num_actual_tokens], + kv_cache_dtype=self.kv_cache_dtype, + key_cache=key_cache, + value_cache=value_cache, + block_table=block_table, + query_start_loc=cu_seqlens_q, + seq_lens=seqused_k, + max_seq_len=max_seqlen_k, + max_query_len=max_seqlen_q, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + alibi_slopes=self.alibi_slopes, + sliding_window=self.sliding_window[0], + sm_scale=self.scale, + sinks=self.sinks, + ) else: descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - unified_attention( + self.unified_attention( q=query[:num_actual_tokens], k=key_cache, v=value_cache, @@ -381,6 +417,7 @@ class TritonAttentionImpl(AttentionImpl): q_descale=None, # Not supported k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), + sinks=self.sinks, ) return output diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 7aeea40b25..f521d94331 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -254,7 +254,11 @@ def get_kv_cache_layout(): # Override with format specified by the user. cache_layout = envs.VLLM_KV_CACHE_LAYOUT if cache_layout is None: - cache_layout = get_kv_connector_cache_layout() + if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + cache_layout = "HND" + else: + cache_layout = get_kv_connector_cache_layout() else: logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. Setting KV cache layout to %s.", cache_layout) @@ -272,7 +276,9 @@ def set_kv_cache_layout(cache_layout: str): class PerLayerParameters: """ Currently, FlashInfer backend only support models in which all layers share - the same values for the following hyperparameters. + the same values for the following hyperparameters. Should not be used for + trtllm-gen backend since it supports different values for the following + hyperparameters. 
""" window_left: int @@ -310,7 +316,8 @@ def get_per_layer_parameters( def infer_global_hyperparameters( per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters: """ - Currently, FlashInfer backend only support models in which all layers share + Currently, FlashInfer backend other than trtllm-gen + only support models in which all layers share the same values for the following hyperparameters: - `window_left` - `logits_soft_cap` @@ -324,15 +331,20 @@ def infer_global_hyperparameters( param_sets = list(per_layer_params.values()) global_params = param_sets[0] - for params in param_sets: - if params.window_left != global_params.window_left: - raise ValueError( - "Window left is not the same for all layers. One potential fix " - "is to set disable_sliding_window=True") - assert params == global_params, ( - "FlashInfer backend currently only supports models in which all " - "layers share the same values for the following hyperparameters: " - "`window_left`, `logits_soft_cap`, `sm_scale`.") + + # trtllm attention doesn't need global hyper params so disable the check + if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + for params in param_sets: + if params.window_left != global_params.window_left: + raise ValueError( + "Window left is not the same for all layers. " \ + "One potential fix is to set disable_sliding_window=True") + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all" + "layers share the same values " + "for the following hyperparameters:" + "`window_left`, `logits_soft_cap`, `sm_scale`.") return global_params From 796bae07c59716b7b61d57343826bfbeabdd01bb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 6 Aug 2025 06:56:14 +0100 Subject: [PATCH 013/932] Update transformers to `v4.55` (#21931) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py Co-authored-by: DarkLight1337 Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py Co-authored-by: Woosuk Kwon --- requirements/common.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 6 +- .../multimodal/generation/test_common.py | 4 + tests/models/registry.py | 24 ++- tests/quantization/test_experts_int8.py | 4 + vllm/model_executor/models/interfaces_base.py | 12 +- vllm/model_executor/models/qwen2_vl.py | 11 +- vllm/model_executor/models/transformers.py | 17 +- vllm/model_executor/models/utils.py | 10 +- vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/ovis.py | 176 ++++++++++++++++++ 13 files changed, 235 insertions(+), 39 deletions(-) create mode 100644 vllm/transformers_utils/configs/ovis.py diff --git a/requirements/common.txt b/requirements/common.txt index c5eb6dab95..0a4b27c034 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.53.2 +transformers >= 4.55.0 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. 
diff --git a/requirements/test.in b/requirements/test.in index 9ecaaae927..9c8c75dd6f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.53.2 +transformers==4.55.0 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 691420df87..08ba964f22 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -214,7 +214,7 @@ fiona==1.10.1 # via torchgeo flask==3.1.1 # via mlflow -fonttools==4.54.1 +fonttools==4.55.0 # via matplotlib fqdn==1.5.1 # via jsonschema @@ -286,7 +286,7 @@ httpx==0.27.2 # via # -r requirements/test.in # schemathesis -huggingface-hub==0.33.1 +huggingface-hub==0.34.3 # via # -r requirements/test.in # accelerate @@ -1148,7 +1148,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.53.2 +transformers==4.55.0 # via # -r requirements/test.in # genai-perf diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 8cb826c114..2a65d7e244 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -337,6 +337,10 @@ VLM_TEST_SETTINGS = { vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we + # should enable this again after the fix is released: + # https://github.com/huggingface/transformers/pull/39915 + marks=[pytest.mark.skip("HF model is broken")], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], diff --git a/tests/models/registry.py b/tests/models/registry.py index 47057d32e9..92a719d7a9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -179,8 +179,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { min_transformers_version="4.54"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), - "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", - min_transformers_version="4.53"), + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), @@ -223,7 +222,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", - extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501 + extras={ + "tiny": "ai21labs/Jamba-tiny-dev", + "random": "ai21labs/Jamba-tiny-random", # noqa: E501 + }), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct", extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501 "hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501 @@ -239,8 +241,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", trust_remote_code=True), - "MiniMaxForCausalLM": 
_HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf", - min_transformers_version="4.53"), + "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"), "MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01", trust_remote_code=True, revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501 @@ -272,6 +273,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", + max_transformers_version="4.53", + transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", trust_remote_code=True), @@ -299,8 +302,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), - "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst", - min_transformers_version="4.53"), + "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), @@ -326,8 +328,12 @@ _EMBEDDING_EXAMPLE_MODELS = { "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True, v0_only=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), - "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), - "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 84a656a3b9..1e3e69e008 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -9,6 +9,8 @@ import pytest from tests.quantization.utils import is_quant_method_supported +from ..models.registry import HF_EXAMPLE_MODELS + MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"] @@ -25,6 +27,8 @@ def test_model_experts_int8_startup( dtype: str, max_tokens: int, ) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4d68227b2a..697fa020de 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project -from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, +from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -14,6 +14,10 @@ if TYPE_CHECKING: from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler from vllm.model_executor.sampling_metadata import SamplingMetadata +else: + VllmConfig = Any + Pooler = Any + SamplingMetadata = Any logger = init_logger(__name__) @@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]): def __init__( self, - vllm_config: "VllmConfig", + vllm_config: VllmConfig, prefix: str = "", ) -> None: ... @@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): def compute_logits( self, hidden_states: T, - sampling_metadata: "SamplingMetadata", + sampling_metadata: SamplingMetadata, ) -> Optional[T]: """Return `None` if TP rank > 0.""" ... @@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. """ - pooler: "Pooler" + pooler: Pooler """The pooler is only called on TP rank 0.""" diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 40d77312b7..633f8598e8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1395,11 +1395,12 @@ class Tarsier2Processor(Qwen2VLProcessor): **kwargs, ): self.image_processor = Tarsier2ImageProcessor(**vision_config) - super().__init__(image_processor=self.image_processor, - tokenizer=tokenizer, - video_processor=Qwen2VLVideoProcessor(), - chat_template=None, - **kwargs) + super().__init__( + image_processor=self.image_processor, + tokenizer=tokenizer, + video_processor=Qwen2VLVideoProcessor(**vision_config), + chat_template=None, + **kwargs) class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo): diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5059d1e1d9..0c3df267ed 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): def replace_linear_class( linear: nn.Linear, style: Literal["colwise", "rowwise"], quant_config: QuantizationConfig -) -> Union[ColumnParallelLinear, RowParallelLinear]: +) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: """ Replace nn.Linear with one of vLLM's tensor parallel linear classes. @@ -445,7 +445,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` - # method after v4.54.0 is released + # method once its checks are fixed in Transformers. self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"), config_override: self.model: PreTrainedModel = AutoModel.from_config( @@ -520,7 +520,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): for i in range(len(layers)): if start_layer <= i and i < end_layer: continue - layers[i] = PPMissingLayer(return_tuple=True) + layers[i] = PPMissingLayer() # Layers after module list for name in pp_plan[module_list_idx + 1:]: @@ -533,14 +533,16 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): Apply the model's tensor parallelization plan. Currently only supports linear layers. 
""" - if not self.model.supports_tp_plan: - if self.tp_size <= 1: - return + tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} + if not tp_plan and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - tp_plan = self.model._tp_plan + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear + tp_plan[".*"] = "replicated" def _tensor_parallel(module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): @@ -552,6 +554,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): child_module, style, self.quant_config) setattr(module, child_name, new_module) log_replacement(qual_name, child_module, new_module) + break else: _tensor_parallel(child_module, prefix=qual_name) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 28508e1bac..fecd14dde4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity): def __init__(self, *args, **kwargs): super().__init__() - self.return_tuple = kwargs.get("return_tuple", False) def forward(self, *args, **kwargs): - """ - Return the first arg from args or the first value from kwargs. - - Wraps the input in a tuple if `self.return_tuple` is True. - """ - input = args[0] if args else next(iter(kwargs.values())) - return (input, ) if self.return_tuple else input + """Return the first arg from args or the first value from kwargs.""" + return args[0] if args else next(iter(kwargs.values())) _CPU_OFFLOAD_BYTES = 0 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8fe153464d..bce24ef74c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,7 +35,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, SpeculatorsConfig, + OvisConfig, RWConfig, + SpeculatorsConfig, Step3TextConfig, Step3VLConfig, UltravoxConfig) # yapf: enable @@ -85,6 +86,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "ovis": OvisConfig, "ultravox": UltravoxConfig, "step3_vl": Step3VLConfig, "step3_text": Step3TextConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 64ace167a5..82d24bb16b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, @@ -45,6 +46,7 @@ __all__ = [ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", + "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", diff --git a/vllm/transformers_utils/configs/ovis.py 
b/vllm/transformers_utils/configs/ovis.py new file mode 100644 index 0000000000..550f5e15db --- /dev/null +++ b/vllm/transformers_utils/configs/ovis.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# yapf: disable +# ruff: noqa: E501 +# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py +# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py +# Ovis Config with AimV2 config registration removed for Transformers compatibility +from typing import Any, Optional, Union + +from transformers import AutoConfig, PretrainedConfig + + +class AIMv2Config(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. 
+ """ + + model_type: str = "aimv2" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + + self.projection_dropout = projection_dropout + self.qkv_bias = qkv_bias + self.use_bias = use_bias + + +# ---------------------------------------------------------------------- +# Visual Tokenizer Configuration +# ---------------------------------------------------------------------- +class BaseVisualTokenizerConfig(PretrainedConfig): + + def __init__(self, + vocab_size=16384, + tokenize_function="softmax", + tau=1.0, + depths=None, + drop_cls_token=False, + backbone_config: Optional[Union[PretrainedConfig, + dict]] = None, + hidden_stride: int = 1, + **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.tokenize_function = tokenize_function + self.tau = tau + if isinstance(depths, str): + depths = [int(x) for x in depths.split('|')] + self.depths = depths + self.backbone_kwargs = dict[str, Any]() + self.drop_cls_token = drop_cls_token + if backbone_config is not None: + assert isinstance(backbone_config, (PretrainedConfig, dict)), \ + f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" + if not isinstance(backbone_config, PretrainedConfig): + model_type = backbone_config['model_type'] + if model_type != "aimv2": + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, **backbone_config) + else: + backbone_config = AIMv2Config(**backbone_config) + self.backbone_config = backbone_config + self.hidden_stride = hidden_stride + + +class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "aimv2_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "siglip_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) +AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) + + +# ---------------------------------------------------------------------- +# Ovis Configuration +# ---------------------------------------------------------------------- +class OvisConfig(PretrainedConfig): + model_type = "ovis" + + def __init__(self, + llm_config: Optional[Union[PretrainedConfig, dict]] = None, + visual_tokenizer_config: Optional[Union[PretrainedConfig, + dict]] = None, + multimodal_max_length=8192, + hidden_size=None, + 
conversation_formatter_class=None, + llm_attn_implementation=None, + disable_tie_weight=False, + **kwargs): + super().__init__(**kwargs) + if llm_config is not None: + assert isinstance(llm_config, (PretrainedConfig, dict)), \ + f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" + if not isinstance(llm_config, PretrainedConfig): + model_type = llm_config['model_type'] + llm_config.pop('model_type') + llm_config = AutoConfig.for_model(model_type, **llm_config) + + # map llm_config to text_config + self.text_config = llm_config + if visual_tokenizer_config is not None: + assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ + f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" + if not isinstance(visual_tokenizer_config, PretrainedConfig): + model_type = visual_tokenizer_config['model_type'] + visual_tokenizer_config.pop('model_type') + visual_tokenizer_config = AutoConfig.for_model( + model_type, **visual_tokenizer_config) + + self.visual_tokenizer_config = visual_tokenizer_config + self.multimodal_max_length = multimodal_max_length + self.hidden_size = hidden_size + self.conversation_formatter_class = conversation_formatter_class + self.llm_attn_implementation = llm_attn_implementation + self.disable_tie_weight = disable_tie_weight From de98252f497b8cde5b9f18a8dac53302f5c72db7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 23:26:00 -0700 Subject: [PATCH 014/932] Add GPT-OSS model code and config [1/N] (#22327) Signed-off-by: Woosuk Kwon --- tests/models/registry.py | 1 + vllm/model_executor/models/config.py | 29 ++ vllm/model_executor/models/gpt_oss.py | 472 +++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 503 insertions(+) create mode 100644 vllm/model_executor/models/gpt_oss.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 92a719d7a9..69961d7385 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -197,6 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", {"1b": "EleutherAI/pythia-1.4b"}), + "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f09be7a59..908d4e628b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -247,6 +247,34 @@ class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig): config.max_model_len) +class GptOssConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + decoding_config = vllm_config.decoding_config + if decoding_config.reasoning_backend == "": + decoding_config.reasoning_backend = "openai" + + # Increase the max capture size from 512 to 1024 for performance. + # NOTE(woosuk): This will increase the number of CUDA graphs + # from 67 to 83. + scheduler_config = vllm_config.scheduler_config + if len(scheduler_config.cuda_graph_sizes) == 1: + max_capture_size = scheduler_config.cuda_graph_sizes[0] + # FIXME(woosuk): When using full cuda graph with FA3, the max + # supported size is 992. 
+ if max_capture_size < 1024: + cuda_graph_sizes = [1, 2, 4] + # Step size 8 for small batch sizes + cuda_graph_sizes += [i for i in range(8, 256, 8)] + # Step size 16 for larger batch sizes + cuda_graph_sizes += [i for i in range(256, 1025, 16)] + scheduler_config.cuda_graph_sizes = cuda_graph_sizes + logger.info( + "Overriding max cuda graph capture size to " + "%d for performance.", 1024) + + class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod @@ -345,4 +373,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, + "GptOssForCausalLM": GptOssConfig, } diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py new file mode 100644 index 0000000000..896560fa24 --- /dev/null +++ b/vllm/model_executor/models/gpt_oss.py @@ -0,0 +1,472 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.distributed as dist +from torch import nn +from transformers import GptOssConfig + +from vllm import envs +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_ep_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.utils import cdiv + +from .utils import extract_layer_index, maybe_prefix + + +class OAIAttention(nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.head_dim = config.head_dim + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.hidden_size = config.hidden_size + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=config.max_position_embeddings, + base=config.rope_theta, + dtype=torch.float32, + rope_scaling={ + "rope_type": + "yarn", + "factor": + config.rope_scaling["factor"], + "original_max_position_embeddings": + config.rope_scaling["original_max_position_embeddings"], + "beta_fast": + config.rope_ntk_beta, + "beta_slow": + config.rope_ntk_alpha, + }, + is_neox_style=True, + ) + + tp_size = get_tensor_model_parallel_world_size() + + attention_sink_dtype = ( + torch.float32 if envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION + or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION else torch.bfloat16) + self.sinks = 
torch.nn.Parameter( + torch.empty(config.num_attention_heads // tp_size, + dtype=attention_sink_dtype, + requires_grad=False)) + + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + + self.q_size = self.num_attention_heads * self.head_dim // tp_size + self.kv_size = self.num_key_value_heads * self.head_dim // tp_size + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + + self.qkv = QKVParallelLinear( + hidden_size=self.hidden_size, + head_size=self.head_dim, + total_num_heads=self.num_attention_heads, + total_num_kv_heads=self.num_key_value_heads, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.num_attention_heads * self.head_dim, + output_size=self.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.num_local_attention_heads = config.num_attention_heads // tp_size + self.num_local_key_value_heads = config.num_key_value_heads // tp_size + + # Only apply sliding window to every other layer + sliding_window = (config.sliding_window if self.layer_idx % + 2 == 0 else None) + self.attn = Attention( + self.num_local_attention_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_local_key_value_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=AttentionType.DECODER, + prefix=f"{prefix}.attn", + sinks=self.sinks, + ) + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + t = self.norm(hidden_states) + + qkv, _ = self.qkv(t) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + v = v.contiguous() + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + + return output + hidden_states + + +class MLPBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + layer_idx: int, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = layer_idx + self.num_experts = config.num_local_experts + self.experts_per_token = config.num_experts_per_tok + self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + self.norm = RMSNorm(config.hidden_size, eps=1e-5) + self.router = torch.nn.Linear(config.hidden_size, + config.num_local_experts, + dtype=torch.bfloat16) + assert config.intermediate_size % self.world_size == 0 + self.experts = FusedMoE(num_experts=config.num_local_experts, + top_k=config.num_experts_per_token, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=True, + renormalize=True, + quant_config=quant_config, + prefix=f"{prefix}.experts", + apply_router_weight_on_input=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + t = self.norm(x) + g = self.router(t) + t = self.experts(hidden_states=t, router_logits=g) + return x + t + + +class TransformerBlock(torch.nn.Module): + + def __init__( + self, + config: GptOssConfig, + quant_config: QuantizationConfig, + prefix: str = "", + ): + super().__init__() + self.layer_idx = extract_layer_index(prefix) + self.attn = OAIAttention(config, prefix=f"{prefix}.attn") + self.mlp = MLPBlock(config, + self.layer_idx, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, hidden_states: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + attn_output = self.attn(hidden_states, positions) + output = self.mlp(attn_output) + return output + + +@support_torch_compile +class 
GptOssModel(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.quant_config = vllm_config.quant_config + self.config.hidden_size = self.config.hidden_size + self.embedding = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + ) + self.layers = torch.nn.ModuleList([ + TransformerBlock( + self.config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, f"block.{layer_idx}"), + ) for layer_idx in range(self.config.num_hidden_layers) + ]) + self.norm = RMSNorm(self.config.hidden_size, eps=1e-5) + + def forward(self, input_ids: torch.Tensor, + positions: torch.Tensor) -> torch.Tensor: + x = self.embedding(input_ids) + for layer in self.layers: + x = layer(x, positions) + x = self.norm(x) + return x + + +class GptOssForCausalLM(nn.Module): + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config.hf_config + self.model = GptOssModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.lm_head = ParallelLMHead( + self.model_config.vocab_size, + self.model_config.hidden_size, + ) + self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + assert intermediate_tensors is None + assert inputs_embeds is None + return self.model(input_ids, positions) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + mxfp4_block = 32 + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + # FIXME(woosuk): Remove this after 
testing. + weight = weight.cuda() + + if "gate_up_proj_blocks" in name: + # Handle MLP gate and up projection weights + new_name = name.replace("gate_up_proj_blocks", "w13_weight") + + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_blocks" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_blocks", "w2_weight") + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "gate_up_proj_scales" in name: + # Handle MLP gate and up projection weights scale + new_name = name.replace("gate_up_proj_scales", + "w13_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_scales" in name: + # Handle MLP down projection weights + new_name = name.replace("down_proj_scales", "w2_weight_scale") + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + param = params_dict[new_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=new_name, + shard_id=None, + expert_id=None) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9b6ab52d86..c746e8ec3f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -74,6 +74,7 @@ _TEXT_GENERATION_MODELS = { "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), + "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), From 98a3a81024649985ed8814a4b7d083d2303fd73c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 5 Aug 2025 23:30:38 -0700 Subject: [PATCH 015/932] [ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/platforms/rocm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 54ffc83cd5..d26e4b3350 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -127,7 +127,8 @@ def use_rocm_custom_paged_attention( max_seq_len: int, sliding_window: int, kv_cache_dtype: str, - alibi_slopes: Optional[torch.Tensor] = None) -> bool: + alibi_slopes: Optional[torch.Tensor] = None, + sinks: Optional[torch.Tensor] = None) -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) @@ -145,7 +146,7 @@ def use_rocm_custom_paged_attention( and max_seq_len <= 128 * 1024 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - and envs.VLLM_ROCM_USE_AITER)) + and envs.VLLM_ROCM_USE_AITER) and sinks is None) else: return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0 @@ -155,7 +156,7 @@ def use_rocm_custom_paged_attention( and (gqa_ratio >= 3 and gqa_ratio <= 16) and max_seq_len <= 128 * 1024 and alibi_slopes is None and kv_cache_dtype == "auto" - 
and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN and sinks is None) class RocmPlatform(Platform): @@ -170,7 +171,7 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf", - "quark", "ptpc_fp8" + "quark", "ptpc_fp8", "mxfp4" ] @classmethod @@ -469,4 +470,4 @@ class RocmPlatform(Platform): @classmethod def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool: - return True \ No newline at end of file + return True From a47e6ffe9366516ea5ca28e27fc87367a869e854 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 5 Aug 2025 23:39:13 -0700 Subject: [PATCH 016/932] [GptOss] Add GptOss reasoning parser to support structure output (#22322) Signed-off-by: Chen Zhang Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Woosuk Kwon Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/model_executor/models/config.py | 6 +-- vllm/reasoning/__init__.py | 2 + vllm/reasoning/gptoss_reasoning_parser.py | 64 +++++++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 vllm/reasoning/gptoss_reasoning_parser.py diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 908d4e628b..6f21cd267b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -247,13 +247,13 @@ class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig): config.max_model_len) -class GptOssConfig(VerifyAndUpdateConfig): +class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: decoding_config = vllm_config.decoding_config if decoding_config.reasoning_backend == "": - decoding_config.reasoning_backend = "openai" + decoding_config.reasoning_backend = "GptOss" # Increase the max capture size from 512 to 1024 for performance. 
# NOTE(woosuk): This will increase the number of CUDA graphs @@ -373,5 +373,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, - "GptOssForCausalLM": GptOssConfig, + "GptOssForCausalLM": GptOssForCausalLMConfig, } diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 1c3f78f2ed..b987adeb64 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -4,6 +4,7 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser +from .gptoss_reasoning_parser import GptOssReasoningParser from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser from .mistral_reasoning_parser import MistralReasoningParser @@ -20,4 +21,5 @@ __all__ = [ "Glm4MoeModelReasoningParser", "MistralReasoningParser", "Step3ReasoningParser", + "GptOssReasoningParser", ] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py new file mode 100644 index 0000000000..05a72ac23b --- /dev/null +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence +from typing import Optional, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("GptOss") +class GptOssReasoningParser(ReasoningParser): + """ + Reasoning parser for GptOss model. + + The GptOss model uses harmony to extract reasoning content and this parser + is only used for detecting the end of the reasoning content. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + self.reasoning_end_token_ids = self.model_tokenizer.encode( + "<|start|>assistant<|channel|>final<|message|>") + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + end_token_ids = self.reasoning_end_token_ids + assert len(end_token_ids) > 0, "reasoning_end_token_ids is empty" + # Check if the end sequence is present in the input_ids. + # We search from the end of input_ids to find the last match. + for i in range(len(input_ids) - len(end_token_ids), -1, -1): + if input_ids[i:i + len(end_token_ids)] == end_token_ids: + return True + return False + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. This " + "function should not be called.") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. 
This " + "function should not be called.") + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + raise RuntimeError( + "GptOss model uses harmony to extract reasoning content. This " + "function should not be called.") From 90ec006937c4bcb33b4c0423285fd72502659cfe Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Tue, 5 Aug 2025 23:48:19 -0700 Subject: [PATCH 017/932] [gpt-oss] flashinfer attention sink init (#22330) Signed-off-by: simon-mo Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Woosuk Kwon Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> --- vllm/v1/attention/backends/flashinfer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8592d1b26d..caf9ecc911 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -611,6 +611,7 @@ class FlashInferImpl(AttentionImpl): logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, + sinks: Optional[torch.Tensor] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -635,6 +636,15 @@ class FlashInferImpl(AttentionImpl): "are not implemented for " "FlashInferImpl") + self.sinks: Optional[torch.Tensor] = None + if sinks is not None: + assert sinks.shape[0] == num_heads, ( + "Sinks must have the same number of heads " + "as the number of heads in the layer" + ) + assert sinks.dtype == torch.float32, "Sinks must be of type float32" + self.sinks = sinks + def forward( self, layer: torch.nn.Module, From 134a8ee8fdbcbb838a54911fd2b129f2ceda0f17 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 00:10:14 -0700 Subject: [PATCH 018/932] [gpt-oss] Add openai-harmony as default dependency (#22332) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- requirements/common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/common.txt b/requirements/common.txt index 0a4b27c034..5405df359a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -49,3 +49,4 @@ ninja # Required for xgrammar, rocm, tpu, xpu pybase64 # fast base64 implementation cbor2 # Required for cross-language serialization of hashable objects setproctitle # Used to set process names for better debugging and monitoring +openai-harmony >= 0.0.3 # Required for gpt-oss From fa00c5d75bc63c87f5822f839db1342f19e4acc8 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 6 Aug 2025 15:50:25 +0800 Subject: [PATCH 019/932] [Misc] Clean up duplicated hf overrides (#22311) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/multimodal/test_tensor_schema.py | 51 +-------------- tests/models/test_initialization.py | 62 +++---------------- tests/models/utils.py | 61 ++++++++++++++++++ 3 files changed, 71 insertions(+), 103 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index f80e8456f0..a4cb1a6883 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ 
b/tests/models/multimodal/test_tensor_schema.py @@ -1,11 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial -from typing import Any from unittest.mock import patch import pytest -from transformers import PretrainedConfig from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine @@ -19,6 +17,7 @@ from vllm.v1.engine.core import EngineCore as V1EngineCore from ...conftest import VllmRunner from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ..utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -51,51 +50,6 @@ def create_batched_mm_kwargs( return mm_kwargs -# Avoid OOM and reduce initialization time by only using 1 layer -def hf_overrides(hf_config: PretrainedConfig, - exist_overrides: dict[str, Any]) -> PretrainedConfig: - hf_config.update(exist_overrides) - text_config = hf_config.get_text_config() - # Ensure at least 2 expert per group - # Since `grouped_topk` assumes top-2 - n_group = getattr(text_config, 'n_group', None) - num_experts = n_group * 2 if n_group is not None else 2 - # we use three layers for Gemma-3n to check - # both normal layer and kv_shared_layer - text_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "num_experts": num_experts, - "num_experts_per_tok": 2, - "num_local_experts": num_experts, - # Otherwise there will not be any expert layers - "first_k_dense_replace": 0, - # To avoid OOM on DeepSeek-V3 - "n_routed_experts": num_experts, - # For Gemma-3n - "num_kv_shared_layers": 1, - }) - if hasattr(hf_config, "vision_config"): - hf_config.vision_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - # e.g.: ibm-granite/granite-speech-3.3-2b - if hasattr(hf_config, "encoder_config"): - hf_config.encoder_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - # e.g.: Qwen/Qwen2-Audio-7B-Instruct - if hasattr(hf_config, "audio_config"): - hf_config.audio_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "encoder_layers": 1, - }) - return hf_config - - @pytest.mark.core_model @pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], @@ -110,7 +64,8 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], model_id = model_info.default - hf_overrides_fn = partial(hf_overrides, + hf_overrides_fn = partial(dummy_hf_overrides, + model_arch=model_arch, exist_overrides=model_info.hf_overrides) model_config = ModelConfig( diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 4c7da24fca..f0aa91566b 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from functools import partial from unittest.mock import patch import pytest -from transformers import PretrainedConfig from vllm import LLM from vllm.config import ModelImpl @@ -16,6 +16,7 @@ from vllm.v1.engine.core import EngineCore as V1EngineCore from ..utils import create_new_process_for_each_test from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels) +from .utils import dummy_hf_overrides @create_new_process_for_each_test() @@ -33,64 +34,15 @@ def can_initialize(model_arch: str, monkeypatch: 
pytest.MonkeyPatch, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + hf_overrides_fn = partial(dummy_hf_overrides, + model_arch=model_arch, + exist_overrides=model_info.hf_overrides) + if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): from vllm.model_executor.models.llama4 import Llama4ForCausalLM from vllm.model_executor.models.registry import ModelRegistry ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid OOM and reduce initialization time by only using 1 layer - def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: - hf_config.update(model_info.hf_overrides) - - text_config = hf_config.get_text_config() - - # Ensure at least 2 expert per group - # Since `grouped_topk` assumes top-2 - n_group = getattr(text_config, 'n_group', None) - num_experts = n_group * 2 if n_group is not None else 2 - - # we use three layers for Gemma-3n to check - # both normal layer and kv_shared_layer - num_hidden_layers = (3 if model_arch - == "Gemma3nForConditionalGeneration" else 1) - - text_config.update({ - "num_layers": 1, - "num_hidden_layers": num_hidden_layers, - "num_experts": num_experts, - "num_experts_per_tok": 2, - "num_local_experts": num_experts, - # Otherwise there will not be any expert layers - "first_k_dense_replace": 0, - # To avoid OOM on DeepSeek-V3 - "n_routed_experts": num_experts, - # For Gemma-3n - "num_kv_shared_layers": 1, - }) - - if hasattr(hf_config, "vision_config"): - hf_config.vision_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - - # e.g.: ibm-granite/granite-speech-3.3-2b - if hasattr(hf_config, "encoder_config"): - hf_config.encoder_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - }) - - # e.g.: Qwen/Qwen2-Audio-7B-Instruct - if hasattr(hf_config, "audio_config"): - hf_config.audio_config.update({ - "num_layers": 1, - "num_hidden_layers": 1, - "encoder_layers": 1, - }) - - return hf_config - # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 @@ -132,7 +84,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, load_format="dummy", model_impl=ModelImpl.TRANSFORMERS if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, - hf_overrides=hf_overrides, + hf_overrides=hf_overrides_fn, ) diff --git a/tests/models/utils.py b/tests/models/utils.py index bda7ea3e3a..1513db5220 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -7,6 +7,7 @@ from typing import Any, NamedTuple, Optional, Union import torch import torch.nn.functional as F +from transformers import PretrainedConfig from vllm.config import ModelConfig, RunnerOption from vllm.inputs import InputContext @@ -351,3 +352,63 @@ class RerankModelInfo(NamedTuple): architecture: str = "" dtype: str = "auto" enable_test: bool = True + + +def dummy_hf_overrides( + hf_config: PretrainedConfig, + model_arch: str, + exist_overrides: Optional[dict[str, Any]] = None, +) -> PretrainedConfig: + """ + Dummy HF overrides function used to create dummy model + with only minimum nums of layer. 
+ """ + hf_config.update(exist_overrides or {}) + + text_config = hf_config.get_text_config() + + # Ensure at least 2 expert per group + # Since `grouped_topk` assumes top-2 + n_group = getattr(text_config, 'n_group', None) + num_experts = n_group * 2 if n_group is not None else 2 + + # we use three layers for Gemma-3n to check + # both normal layer and kv_shared_layer + num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration" + else 1) + text_config.update({ + "num_layers": 1, + "num_hidden_layers": num_hidden_layers, + "num_experts": num_experts, + "num_experts_per_tok": 2, + "num_local_experts": num_experts, + # Otherwise there will not be any expert layers + "first_k_dense_replace": 0, + # To avoid OOM on DeepSeek-V3 + "n_routed_experts": num_experts, + # For Gemma-3n + "num_kv_shared_layers": 1, + }) + + if hasattr(hf_config, "vision_config"): + hf_config.vision_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + + # e.g.: ibm-granite/granite-speech-3.3-2b + if hasattr(hf_config, "encoder_config"): + hf_config.encoder_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + + # e.g.: Qwen/Qwen2-Audio-7B-Instruct + if hasattr(hf_config, "audio_config"): + hf_config.audio_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "encoder_layers": 1, + }) + + return hf_config From 178d03fbd64e18999647b349623cd1489f816c8c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 01:08:49 -0700 Subject: [PATCH 020/932] [gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22340) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/entrypoints/context.py | 177 ++++++++++++++++++++++++++++++ vllm/entrypoints/harmony_utils.py | 111 +++++++++++++++++++ vllm/entrypoints/tool.py | 87 +++++++++++++++ 3 files changed, 375 insertions(+) create mode 100644 vllm/entrypoints/context.py create mode 100644 vllm/entrypoints/harmony_utils.py create mode 100644 vllm/entrypoints/tool.py diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py new file mode 100644 index 0000000000..6292306e7c --- /dev/null +++ b/vllm/entrypoints/context.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from abc import ABC, abstractmethod + +from openai_harmony import Message, Role, StreamState + +from vllm.entrypoints.harmony_utils import ( + get_encoding, get_streamable_parser_for_assistant, render_for_completion) +from vllm.entrypoints.tool import Tool +from vllm.outputs import RequestOutput + +logger = logging.getLogger(__name__) + + +class ConversationContext(ABC): + + @abstractmethod + def append_output(self, output) -> None: + pass + + @abstractmethod + async def call_tool(self) -> list[Message]: + pass + + @abstractmethod + def need_builtin_tool_call(self) -> bool: + pass + + @abstractmethod + def render_for_completion(self) -> list[int]: + pass + + +class SimpleContext(ConversationContext): + + def __init__(self): + self.last_output = None + + def append_output(self, output) -> None: + self.last_output = output + + def need_builtin_tool_call(self) -> bool: + return False + + async def call_tool(self) -> list[Message]: + raise NotImplementedError("Should not be called.") + + def 
render_for_completion(self) -> list[int]: + raise NotImplementedError("Should not be called.") + + +class HarmonyContext(ConversationContext): + + def __init__( + self, + messages: list, + tool_sessions: dict[str, Tool], + ): + self._messages = messages + self.tool_sessions = tool_sessions + + self.parser = get_streamable_parser_for_assistant() + self.num_init_messages = len(messages) + # TODO(woosuk): Implement the following fields. + self.num_prompt_tokens = 0 + self.num_cached_tokens = 0 + self.num_output_tokens = 0 + self.num_reasoning_tokens = 0 + + def append_output(self, output) -> None: + if isinstance(output, RequestOutput): + output_token_ids = output.outputs[0].token_ids + for token_id in output_token_ids: + self.parser.process(token_id) + output_msgs = self.parser.messages + else: + # Tool output. + output_msgs = output + self._messages.extend(output_msgs) + + @property + def messages(self) -> list: + return self._messages + + def need_builtin_tool_call(self) -> bool: + last_msg = self.messages[-1] + recipient = last_msg.recipient + return recipient is not None and (recipient.startswith("browser.") + or recipient.startswith("python")) + + async def call_tool(self) -> list[Message]: + if not self.messages: + return [] + last_msg = self.messages[-1] + recipient = last_msg.recipient + if recipient is not None: + if recipient.startswith("browser."): + return await self.call_search_tool( + self.tool_sessions["browser"], last_msg) + elif recipient.startswith("python"): + return await self.call_python_tool( + self.tool_sessions["python"], last_msg) + raise ValueError("No tool call found") + + def render_for_completion(self) -> list[int]: + return render_for_completion(self.messages) + + async def call_search_tool( + self, + tool_session: Tool, + last_msg: Message, + ) -> list[Message]: + return await tool_session.get_result(self) + + async def call_python_tool( + self, + tool_session: Tool, + last_msg: Message, + ) -> list[Message]: + return await tool_session.get_result(self) + + +class StreamingHarmonyContext(HarmonyContext): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.last_output = None + + self.parser = get_streamable_parser_for_assistant() + self.encoding = get_encoding() + self.last_tok = None + + @property + def messages(self) -> list: + return self.parser.messages + + def append_output(self, output) -> None: + if isinstance(output, RequestOutput): + tok = output.outputs[0].token_ids[0] + self.parser.process(tok) + self.last_tok = tok + else: + # Handle the case of tool output in direct message format + assert len(output) == 1, "Tool output should be a single message" + msg = output[0] + # Sometimes the recipient is not set for tool messages, + # so we set it to "assistant" + if msg.author.role == Role.TOOL and msg.recipient is None: + msg.recipient = "assistant" + toks = self.encoding.render(msg) + for tok in toks: + self.parser.process(tok) + self.last_tok = toks[-1] + + def is_expecting_start(self) -> bool: + return self.parser.state == StreamState.EXPECT_START + + def is_assistant_action_turn(self) -> bool: + return self.last_tok in self.encoding.stop_tokens_for_assistant_actions( + ) + + def render_for_completion(self) -> list[int]: + # now this list of tokens as next turn's starting tokens + # `<|start|>assistant``, + # we need to process them in parser. 
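# (Editor's note) Put differently: super().render_for_completion() re-renders the whole
# conversation and appends the header tokens for the assistant's next turn, which the
# streaming parser has not seen yet. The loop below walks backwards from the end of the
# rendered prompt until it meets self.last_tok, then replays the collected tokens in
# forward order so the parser state matches what is actually sent to the engine.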
+ rendered_tokens = super().render_for_completion() + + last_n = -1 + to_process = [] + while rendered_tokens[last_n] != self.last_tok: + to_process.append(rendered_tokens[last_n]) + last_n -= 1 + for tok in reversed(to_process): + self.parser.process(tok) + + return rendered_tokens diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py new file mode 100644 index 0000000000..801c82b4fa --- /dev/null +++ b/vllm/entrypoints/harmony_utils.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime +from typing import Literal, Optional + +from openai.types.responses.tool import Tool +from openai_harmony import (Conversation, DeveloperContent, + HarmonyEncodingName, Message, ReasoningEffort, + Role, StreamableParser, SystemContent, TextContent, + ToolDescription, load_harmony_encoding) + +REASONING_EFFORT = { + "high": ReasoningEffort.HIGH, + "medium": ReasoningEffort.MEDIUM, + "low": ReasoningEffort.LOW, +} + +_harmony_encoding = None + + +def get_encoding(): + global _harmony_encoding + if _harmony_encoding is None: + _harmony_encoding = load_harmony_encoding( + HarmonyEncodingName.HARMONY_GPT_OSS) + return _harmony_encoding + + +def get_system_message( + model_identity: Optional[str] = None, + reasoning_effort: Optional[Literal["high", "medium", "low"]] = None, + start_date: Optional[str] = None, + browser_description: Optional[str] = None, + python_description: Optional[str] = None, +) -> Message: + sys_msg_content = SystemContent.new() + if model_identity is not None: + sys_msg_content = sys_msg_content.with_model_identity(model_identity) + if reasoning_effort is not None: + sys_msg_content = sys_msg_content.with_reasoning_effort( + REASONING_EFFORT[reasoning_effort]) + if start_date is None: + # NOTE(woosuk): This brings non-determinism in vLLM. Be careful. + start_date = datetime.datetime.now().strftime("%Y-%m-%d") + sys_msg_content = sys_msg_content.with_conversation_start_date(start_date) + if browser_description is not None: + sys_msg_content = sys_msg_content.with_tools(browser_description) + if python_description is not None: + sys_msg_content = sys_msg_content.with_tools(python_description) + sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content) + return sys_msg + + +def get_developer_message(instructions: Optional[str] = None, + tools: Optional[list[Tool]] = None) -> Message: + dev_msg_content = DeveloperContent.new() + if instructions is not None: + dev_msg_content = dev_msg_content.with_instructions(instructions) + if tools is not None: + function_tools = [] + for tool in tools: + if tool.type in ("web_search_preview", "code_interpreter"): + # These are built-in tools that are added to the system message. 
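# (Editor's note) Concretely: "web_search_preview" / "code_interpreter" requests are
# expected to be expressed through get_system_message(browser_description=...,
# python_description=...) rather than as developer-level function tools, which is why
# they are skipped here.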
+ pass + elif tool.type == "function": + function_tools.append(tool) + else: + raise ValueError(f"tool type {tool.type} not supported") + if function_tools: + function_tool_descriptions = [ + ToolDescription.new( + name=tool.name, + description=tool.description, + parameters=tool.parameters, + ) for tool in function_tools + ] + dev_msg_content = dev_msg_content.with_function_tools( + function_tool_descriptions) + dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content) + return dev_msg + + +def get_user_message(content: str) -> Message: + return Message.from_role_and_content(Role.USER, content) + + +def parse_chat_input(chat_msg) -> Message: + role = chat_msg["role"] + content = chat_msg["content"] + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. + contents = [TextContent(text=c["text"]) for c in content] + msg = Message.from_role_and_contents(role, contents) + return msg + + +def render_for_completion(messages: list[Message]) -> list[int]: + conversation = Conversation.from_messages(messages) + token_ids = get_encoding().render_conversation_for_completion( + conversation, Role.ASSISTANT) + return token_ids + + +def get_stop_tokens_for_assistant_actions() -> list[int]: + return get_encoding().stop_tokens_for_assistant_actions() + + +def get_streamable_parser_for_assistant() -> StreamableParser: + return StreamableParser(get_encoding(), role=Role.ASSISTANT) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py new file mode 100644 index 0000000000..01ee77414f --- /dev/null +++ b/vllm/entrypoints/tool.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from vllm.logger import init_logger + +if TYPE_CHECKING: + # Avoid circular import. 
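# (Editor's note) context.py imports Tool at module level, while the tools below only
# need ConversationContext for type hints (and import HarmonyContext lazily inside
# get_result), so the import is confined to TYPE_CHECKING to break the cycle.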
+ from vllm.entrypoints.context import ConversationContext + +logger = init_logger(__name__) + + +class Tool(ABC): + + @abstractmethod + async def get_result(self, context: "ConversationContext") -> Any: + pass + + +class HarmonyBrowserTool(Tool): + + def __init__(self): + self.enabled = True + exa_api_key = os.getenv("EXA_API_KEY") + if not exa_api_key: + self.enabled = False + logger.warning_once("EXA_API_KEY is not set, browsing is disabled") + return + + try: + from gpt_oss.tools.simple_browser import SimpleBrowserTool + from gpt_oss.tools.simple_browser.backend import ExaBackend + except ImportError: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed, browsing is disabled") + return + + browser_backend = ExaBackend(source="web", api_key=exa_api_key) + self.browser_tool = SimpleBrowserTool(backend=browser_backend) + logger.info_once("Browser tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.browser_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.browser_tool.tool_config + + +class HarmonyPythonTool(Tool): + + def __init__(self): + self.enabled = True + + try: + from gpt_oss.tools.python_docker.docker_tool import PythonTool + except ImportError: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed, code interpreter is disabled") + return + + self.python_tool = PythonTool() + logger.info_once("Code interpreter tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.python_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.python_tool.tool_config From 54991c548a87392c0c1375e902db1f2ad71c105a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 6 Aug 2025 01:49:44 -0700 Subject: [PATCH 021/932] [gpt-oss] add model to supported models doc (#22336) Signed-off-by: Roger Wang --- docs/models/supported_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 017a339ffc..120fd3f485 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -356,6 +356,7 @@ th { | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | | `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | +| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | | ✅︎ | | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | From f263a4b53fb4070460f3d82538600cf667516d06 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 01:57:39 -0700 Subject: [PATCH 022/932] [gpt-oss] Support chat completion api (#22342) --- vllm/entrypoints/harmony_utils.py | 34 +++++ vllm/entrypoints/openai/protocol.py | 4 + vllm/entrypoints/openai/serving_chat.py | 169 ++++++++++++++++++++---- 3 files changed, 183 insertions(+), 24 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 801c82b4fa..c1b0a084f3 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +from collections.abc import Iterable from typing import Literal, Optional from openai.types.responses.tool import Tool @@ -109,3 +110,36 @@ def get_stop_tokens_for_assistant_actions() -> list[int]: def get_streamable_parser_for_assistant() -> StreamableParser: return StreamableParser(get_encoding(), role=Role.ASSISTANT) + + +def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: + parser = get_streamable_parser_for_assistant() + for token_id in token_ids: + parser.process(token_id) + return parser + + +def parse_chat_output( + token_ids: list[int]) -> tuple[Optional[str], Optional[str], bool]: + parser = parse_output_into_messages(token_ids) + output_msgs = parser.messages + if len(output_msgs) == 0: + # The generation has stopped during reasoning. + is_tool_call = False + reasoning_content = parser.current_content + final_content = None + elif len(output_msgs) == 1: + # The generation has stopped during final message. 
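# (Editor's note) i.e. exactly one complete message (the analysis/reasoning message)
# has been closed by the parser, and the still in-progress final-channel text is
# recovered from parser.current_content below.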
+ is_tool_call = False + reasoning_content = output_msgs[0].content[0].text + final_content = parser.current_content + else: + if len(output_msgs) != 2: + raise ValueError( + "Expected 2 output messages (reasoning and final), " + f"but got {len(output_msgs)}.") + reasoning_msg, final_msg = output_msgs + reasoning_content = reasoning_msg.content[0].text + final_content = final_msg.content[0].text + is_tool_call = final_msg.recipient is not None + return reasoning_content, final_content, is_tool_call diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 64f2beb140..57aa427207 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -323,6 +323,7 @@ class ResponsesRequest(OpenAIBaseModel): if (top_p := self.top_p) is None: top_p = default_sampling_params.get( "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]) + stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output guided_decoding = None @@ -340,6 +341,7 @@ class ResponsesRequest(OpenAIBaseModel): top_p=top_p, max_tokens=max_tokens, logprobs=self.top_logprobs, + stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), guided_decoding=guided_decoding, @@ -404,6 +406,8 @@ class ChatCompletionRequest(OpenAIBaseModel): Literal["required"], ChatCompletionNamedToolChoiceParam, ]] = "none" + reasoning_effort: Optional[Literal["low", "medium", "high"]] = None + include_reasoning: bool = True # NOTE this will be ignored by vLLM -- the model determines the behavior parallel_tool_calls: Optional[bool] = False diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index e1d8a31672..6ad0a8ec54 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,6 +12,7 @@ import jinja2 import partial_json_parser import regex as re from fastapi import Request +from openai_harmony import Message as OpenAIMessage from pydantic import TypeAdapter from vllm.config import ModelConfig @@ -19,6 +20,10 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, random_tool_call_id) +from vllm.entrypoints.harmony_utils import ( + get_developer_message, get_stop_tokens_for_assistant_actions, + get_streamable_parser_for_assistant, get_system_message, parse_chat_input, + parse_chat_output, render_for_completion) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -35,6 +40,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall) from vllm.entrypoints.utils import get_max_tokens +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager @@ -125,6 +131,23 @@ class OpenAIServingChat(OpenAIServing): logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + self.use_harmony = model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + 
get_stop_tokens_for_assistant_actions()) + + # NOTE(woosuk): While OpenAI's chat completion API supports browsing + # for some models, currently vLLM doesn't support it. Please use the + # Responses API instead. + self.supports_browsing = False + self.browser_tool = None + # NOTE(woosuk): Chat completion API does not support code interpreter. + # Please use the Responses API instead. + self.supports_code_interpreter = False + self.python_tool = None + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -169,7 +192,8 @@ class OpenAIServingChat(OpenAIServing): if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) - and not isinstance(tokenizer, MistralTokenizer)): + and not isinstance(tokenizer, MistralTokenizer) + and not self.use_harmony): # for hf tokenizers, "auto" tools requires # --enable-auto-tool-choice and --tool-call-parser return self.create_error_response( @@ -184,25 +208,35 @@ class OpenAIServingChat(OpenAIServing): else: tool_dicts = [tool.model_dump() for tool in request.tools] - ( - conversation, - request_prompts, - engine_prompts, - ) = await self._preprocess_chat( - request, - tokenizer, - request.messages, - chat_template=request.chat_template or self.chat_template, - chat_template_content_format=self.chat_template_content_format, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tool_dicts=tool_dicts, - documents=request.documents, - chat_template_kwargs=request.chat_template_kwargs, - tool_parser=tool_parser, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) + if not self.use_harmony: + # Common case. + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + # For GPT-OSS. + ( + conversation, + request_prompts, + engine_prompts, + ) = self._make_request_with_harmony(request) except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -436,6 +470,11 @@ class OpenAIServingChat(OpenAIServing): finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 num_cached_tokens = None + if self.use_harmony: + harmony_parsers = [ + get_streamable_parser_for_assistant() + for _ in range(num_choices) + ] if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -597,7 +636,18 @@ class OpenAIServingChat(OpenAIServing): else: logprobs = None - delta_text = output.text + if self.use_harmony: + harmony_parser = harmony_parsers[i] + for token_id in output.token_ids: + harmony_parser.process(token_id) + # FIXME(woosuk): Support function calling + is_final = harmony_parser.current_channel == "final" + if not (request.include_reasoning or is_final): + # Skip the reasoning content. 
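# (Editor's note) i.e. when the client sets include_reasoning=False, deltas from the
# analysis channel are dropped here; final-channel deltas are always streamed.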
+ continue + delta_text = harmony_parser.last_content_delta or "" + else: + delta_text = output.text if not delta_text and not output.token_ids and \ not previous_num_tokens[i]: @@ -607,7 +657,8 @@ class OpenAIServingChat(OpenAIServing): delta_message: Optional[DeltaMessage] # just update previous_texts and previous_token_ids - if tool_choice_auto or self.reasoning_parser: + if ((tool_choice_auto or self.reasoning_parser) + and not self.use_harmony): assert previous_texts is not None assert all_previous_token_ids is not None previous_text = previous_texts[i] @@ -621,8 +672,14 @@ class OpenAIServingChat(OpenAIServing): else: current_token_ids = list(output.token_ids) + if self.use_harmony: + if is_final: + delta_message = DeltaMessage(content=delta_text) + else: + delta_message = DeltaMessage( + reasoning_content=delta_text) # handle streaming deltas for tools with named tool_choice - if tool_choice_function_name: + elif tool_choice_function_name: if (self.reasoning_parser and not reasoning_end_arr[i] and not reasoning_parser.is_reasoning_end( previous_token_ids)): @@ -990,7 +1047,38 @@ class OpenAIServingChat(OpenAIServing): ) else: logprobs = None - auto_tools_called = False + + if self.use_harmony: + reasoning_content, final_content, is_tool_call = ( + parse_chat_output(token_ids)) + if not request.include_reasoning: + reasoning_content = None + + if is_tool_call: + # TODO(woosuk): Implement tool call for gpt-oss. + # For now, only Responses API supports tool call for + # gpt-oss. + raise NotImplementedError( + "Tool call in Chat Completion API is not supported " + "for gpt-oss yet. Please use Responses API instead.") + else: + # Normal message + message = ChatMessage( + role=role, + reasoning_content=reasoning_content, + content=final_content, + ) + + choice_data = ChatCompletionResponseChoice( + index=output.index, + message=message, + logprobs=logprobs, + finish_reason="tool_calls" if is_tool_call else + output.finish_reason if output.finish_reason else "stop", + stop_reason=output.stop_reason, + ) + choices.append(choice_data) + continue if self.reasoning_parser: try: @@ -1003,10 +1091,13 @@ class OpenAIServingChat(OpenAIServing): reasoning_content, content = ( reasoning_parser.extract_reasoning_content( output.text, request=request)) + if not request.include_reasoning: + reasoning_content = None else: reasoning_content = None content = output.text + auto_tools_called = False # if auto tools are not enabled, and a named tool choice using # outlines is not being used if (not self.enable_auto_tools or not self.tool_parser) and \ @@ -1261,3 +1352,33 @@ class OpenAIServingChat(OpenAIServing): and delta_message.tool_calls[0].function and delta_message.tool_calls[0].function.arguments is not None ) + + def _make_request_with_harmony( + self, + request: ChatCompletionRequest, + ): + messages: list[OpenAIMessage] = [] + + # Add system message. + # NOTE: In Chat Completion API, browsing is enabled by default + # if the model supports it. TODO: Support browsing. + assert not self.supports_browsing + assert not self.supports_code_interpreter + sys_msg = get_system_message( + reasoning_effort=request.reasoning_effort, + browser_description=None, + python_description=None) + messages.append(sys_msg) + + # Add developer message. + dev_msg = get_developer_message() + messages.append(dev_msg) + + # Add user message. + for chat_msg in request.messages: + messages.append(parse_chat_input(chat_msg)) + + # Render prompt token ids. 
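# (Editor's note) render_for_completion() (harmony_utils) wraps the Message list in a
# Conversation and tokenizes it up to the assistant's next turn, so the engine receives
# token ids directly (EngineTokensPrompt below) instead of a chat-templated string.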
+ prompt_token_ids = render_for_completion(messages) + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + return messages, [prompt_token_ids], [engine_prompt] From 9edd1db02bc6dce6da503503a373657f3466a78b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 02:22:03 -0700 Subject: [PATCH 023/932] [Minor] Fix type (#22347) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/harmony_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index c1b0a084f3..ecda35c980 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from typing import Literal, Optional from openai.types.responses.tool import Tool @@ -120,7 +120,7 @@ def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: def parse_chat_output( - token_ids: list[int]) -> tuple[Optional[str], Optional[str], bool]: + token_ids: Sequence[int]) -> tuple[Optional[str], Optional[str], bool]: parser = parse_output_into_messages(token_ids) output_msgs = parser.messages if len(output_msgs) == 0: From 2cb6ef8996320273705933d5b24fc6674eb95de8 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 11:03:03 -0400 Subject: [PATCH 024/932] [BugFix] Fix FA2 RuntimeError when sinks is provided (#22365) Signed-off-by: LucasWilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4eb4b464a2..59b99e9e20 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG b99f8c821771fd11feb66d5c89661e9858fde359 + GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From b4b9813b5e2076d510ae518252f64064e6646a3e Mon Sep 17 00:00:00 2001 From: Zhang Jason Date: Wed, 6 Aug 2025 23:58:38 +0800 Subject: [PATCH 025/932] add the codes to check AMD Instinct GPU number (#22367) Signed-off-by: Zhang Jason --- .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index 1178681f15..a409c49b5d 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -21,8 +21,14 @@ check_hf_token() { } check_num_gpus() { - # can you check if the number of GPUs are >=2 via nvidia-smi? - num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + # can you check if the number of GPUs are >=2 via nvidia-smi/rocm-smi? + which rocm-smi > /dev/null 2>&1 + if [ $? 
-ne 0 ]; then + num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + else + num_gpus=$(rocm-smi --showid | grep Instinct | wc -l) + fi + if [ "$num_gpus" -lt 2 ]; then echo "You need at least 2 GPUs to run disaggregated prefill." exit 1 From 4a6b72c2ab9848af31d51d3105a1992b7d5a01dc Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 12:47:38 -0400 Subject: [PATCH 026/932] [BugFix] Fix triton compile error in `kernel_unified_attention_2/3d` caused by attention sinks (#22368) Signed-off-by: LucasWilkinson --- .../attention/ops/triton_unified_attention.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index ba4299a277..56ebed0f52 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -75,6 +75,7 @@ def kernel_unified_attention_2d( USE_ALIBI_SLOPES: tl.constexpr, # bool USE_QQ_BIAS: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int @@ -132,7 +133,7 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None: + if not USE_SINKS: M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: M = tl.load( @@ -322,6 +323,7 @@ def kernel_unified_attention_3d( USE_ALIBI_SLOPES: tl.constexpr, # bool USE_QQ_BIAS: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int @@ -393,14 +395,17 @@ def kernel_unified_attention_3d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None or segm_idx != 0: - M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + if USE_SINKS: + if segm_idx == 0: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: - M = tl.load( - sink_ptr + query_offset_1, - mask=query_mask_1, - other=float("-inf"), - ).to(dtype=tl.float32) + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) @@ -716,6 +721,7 @@ def unified_attention( USE_ALIBI_SLOPES=use_alibi_slopes, USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), @@ -787,6 +793,7 @@ def unified_attention( USE_ALIBI_SLOPES=use_alibi_slopes, USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), stride_k_cache_1=k.stride(1), From 2435ea7ed5c3a7d058cc6f6d649316e96976acaa Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 6 Aug 2025 13:00:58 -0400 Subject: [PATCH 027/932] [Bugfix] Make condition in triton kernel constexpr (#22370) Signed-off-by: Gregory Shtrasberg --- vllm/attention/ops/chunked_prefill_paged_decode.py | 4 +++- vllm/attention/ops/prefix_prefill.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 
08bfcc974c..dc10d7eca9 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -60,6 +60,7 @@ def kernel_paged_attention_2d( stride_v_cache_3: tl.int64, # int filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, # [num_seqs+1] + USE_SINKS: tl.constexpr, # bool ): seq_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -96,7 +97,7 @@ def kernel_paged_attention_2d( block_table_offset = seq_idx * block_table_stride - if sink_ptr is None: + if not USE_SINKS: M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) @@ -386,4 +387,5 @@ def chunked_prefill_paged_decode( stride_v_cache_3=value_cache.stride(3), filter_by_query_len=True, query_start_len_ptr=query_start_loc, + USE_SINKS=sinks is not None, ) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 64c9033797..e1d41930f6 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -81,6 +81,7 @@ def _fwd_kernel(Q, num_unroll_cache: tl.constexpr, num_unroll_request: tl.constexpr, SKIP_DECODE: tl.constexpr, + USE_SINKS: tl.constexpr, MAX_Q_LEN: tl.constexpr = 0, MAX_CTX_LEN: tl.constexpr = 0): @@ -127,7 +128,7 @@ def _fwd_kernel(Q, other=0.0) # [M,D] # initialize pointer to m and l - if sink_ptr is None: + if not USE_SINKS: m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) else: m_i = tl.load( @@ -910,5 +911,6 @@ def context_attention_fwd(q, num_unroll_request=1, num_warps=4, num_stages=1, + USE_SINKS=sinks is not None, **extra_kargs) return From ec7cb1922478015b4e7eae73c6acde8b598a05a8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 6 Aug 2025 10:32:21 -0700 Subject: [PATCH 028/932] [gpt-oss] Add loop for built-in tool call (#22374) Signed-off-by: Woosuk Kwon Co-authored-by: LiuXiaoxuanPKU Co-authored-by: simon-mo Co-authored-by: Chen Zhang Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com> Co-authored-by: Yongye Zhu --- vllm/entrypoints/openai/serving_engine.py | 56 ++++++++++++++++++++ vllm/entrypoints/openai/serving_responses.py | 33 ++++++------ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 71976fea1e..822f186840 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -35,6 +35,7 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_mistral_chat_template, parse_chat_messages_futures, resolve_chat_template_content_format) +from vllm.entrypoints.context import ConversationContext from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, @@ -948,6 +949,61 @@ class OpenAIServing: return conversation, [request_prompt], [engine_prompt] + async def _generate_with_builtin_tools( + self, + request_id: str, + request_prompt: RequestPrompt, + engine_prompt: EngineTokensPrompt, + sampling_params: SamplingParams, + context: ConversationContext, + lora_request: Optional[LoRARequest] = None, + priority: int = 0, + **kwargs, + ): + orig_priority = priority + while True: + self._log_inputs( + request_id, + request_prompt, + params=sampling_params, + lora_request=lora_request, + ) + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + 
priority=priority, + **kwargs, + ) + async for res in generator: + context.append_output(res) + # NOTE(woosuk): The stop condition is handled by the engine. + yield context + + if not context.need_builtin_tool_call(): + # The model did not ask for a tool call, so we're done. + break + + # Call the tool and update the context with the result. + tool_output = await context.call_tool() + context.append_output(tool_output) + + # TODO: uncomment this and enable tool output streaming + # yield context + + # Create inputs for the next turn. + # Render the next prompt token ids. + prompt_token_ids = context.render_for_completion() + engine_prompt = EngineTokensPrompt( + prompt_token_ids=prompt_token_ids) + request_prompt = prompt_token_ids + # Update the sampling params. + sampling_params.max_tokens = (self.max_model_len - + len(prompt_token_ids)) + # OPTIMIZATION + priority = orig_priority - 1 + def _load_prompt_embeds( self, prompt_embeds: Optional[Union[bytes, list[bytes]]], diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index e009529fbd..f340854386 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -16,6 +16,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) +from vllm.entrypoints.context import ConversationContext, SimpleContext from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -29,7 +30,6 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger -from vllm.outputs import RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -187,7 +187,7 @@ class OpenAIServingResponses(OpenAIServing): raw_request.state.request_metadata = request_metadata # Schedule the request and get the result generator. 
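Editor's note: the tool-calling loop added to serving_engine.py above is easier to follow in isolation. The sketch below is a simplified restatement for readability only; logging, LoRA handling and the priority bump are omitted, and `run_turns`, its argument list and passing `max_model_len` explicitly are illustrative choices, not part of this patch.

from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

async def run_turns(engine, context, sampling_params, request_id, max_model_len):
    """Condensed restatement of _generate_with_builtin_tools (readability only)."""
    prompt = EngineTokensPrompt(prompt_token_ids=context.render_for_completion())
    while True:
        # Stream one model turn into the context; the harmony parser tracks messages.
        async for res in engine.generate(prompt, sampling_params, request_id):
            context.append_output(res)
        if not context.need_builtin_tool_call():
            break  # no "browser." / "python" recipient -> final answer reached
        # Run the built-in tool and append its output messages to the conversation.
        context.append_output(await context.call_tool())
        # Re-render the conversation for the next turn and shrink the token budget.
        token_ids = context.render_for_completion()
        prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
        sampling_params.max_tokens = max_model_len - len(token_ids)
    return context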
- generators: list[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[ConversationContext, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): default_max_tokens = self.max_model_len - len( @@ -195,21 +195,19 @@ class OpenAIServingResponses(OpenAIServing): sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params) - self._log_inputs(request.request_id, - request_prompts[i], - params=sampling_params, - lora_request=lora_request) - trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) - generator = self.engine_client.generate( - engine_prompt, - sampling_params, - request.request_id, + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, lora_request=lora_request, - trace_headers=trace_headers, priority=request.priority, + trace_headers=trace_headers, ) generators.append(generator) except ValueError as e: @@ -277,7 +275,7 @@ class OpenAIServingResponses(OpenAIServing): self, request: ResponsesRequest, sampling_params: SamplingParams, - result_generator: AsyncIterator[RequestOutput], + result_generator: AsyncIterator[ConversationContext], model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, @@ -285,17 +283,20 @@ class OpenAIServingResponses(OpenAIServing): ) -> Union[ErrorResponse, ResponsesResponse]: if created_time is None: created_time = int(time.time()) - final_res: Optional[RequestOutput] = None + context: Optional[ConversationContext] = None try: - async for res in result_generator: - final_res = res + async for context in result_generator: + pass except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + assert context is not None + assert isinstance(context, SimpleContext) + final_res = context.last_output assert final_res is not None assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] From 31f5dc5b2a5da18bc17240c7a67e8770d00901d8 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 11:41:42 -0700 Subject: [PATCH 029/932] [gpt-oss] Enhance error msg on attention sink init (#22335) Signed-off-by: simon-mo Signed-off-by: Yongye Zhu Co-authored-by: simon-mo --- vllm/v1/attention/backends/flashinfer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index caf9ecc911..061bd5f1d2 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -638,11 +638,15 @@ class FlashInferImpl(AttentionImpl): self.sinks: Optional[torch.Tensor] = None if sinks is not None: - assert sinks.shape[0] == num_heads, ( - "Sinks must have the same number of heads " - "as the number of heads in the layer" - ) - assert sinks.dtype == torch.float32, "Sinks must be of type float32" + if sinks.shape[0] != num_heads: + raise ValueError( + "Sinks must have the same number of heads as the number of " + f"heads in the layer. Expected {num_heads}, but got " + f"{sinks.shape[0]}." 
+ ) + if sinks.dtype != torch.float32: + raise ValueError("Sinks must be of type float32, but got " + f"{sinks.dtype}.") self.sinks = sinks def forward( From 31f09c615f4f067dba765ce5fe7d00d880212a6d Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 12:37:27 -0700 Subject: [PATCH 030/932] [gpt-oss] flashinfer mxfp4 (#22339) Signed-off-by: simon-mo Signed-off-by: Yongye Zhu Co-authored-by: simon-mo --- vllm/envs.py | 12 + vllm/model_executor/layers/fused_moe/layer.py | 32 +- .../layers/quantization/__init__.py | 3 + .../layers/quantization/mxfp4.py | 387 ++++++++++++++++++ .../layers/quantization/utils/mxfp4_utils.py | 22 + 5 files changed, 453 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/mxfp4.py diff --git a/vllm/envs.py b/vllm/envs.py index f8a7197dd1..8a3eb8e509 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -154,6 +154,8 @@ if TYPE_CHECKING: VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False def get_default_cache_root(): @@ -932,6 +934,16 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), + # If set to 1, use the FlashInfer + # MXFP8 (activation) x MXFP4 (weight) MoE backend. + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))), + + # If set to 1, use the FlashInfer + # BF16 (activation) x MXFP4 (weight) MoE backend. + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))), + # Control the cache sized used by the xgrammar compiler. The default # of 512 MB should be enough for roughly 1000 JSON schemas. # It can be changed with this variable if needed for some reason. 
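Editor's note: a minimal sketch of how the two FlashInfer MXFP4 flags introduced above would be enabled. The model name comes from the supported-models table earlier in this series; using the offline LLM entrypoint and setting the variable via os.environ are illustrative assumptions, not part of the patch.

import os

# Choose one activation path for the MXFP4-weight MoE kernels before creating the engine.
os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"] = "1"     # BF16 activations x MXFP4 weights
# os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"] = "1"  # MXFP8 activations x MXFP4 weights

from vllm import LLM

llm = LLM(model="openai/gpt-oss-20b")  # checkpoint whose MoE weights use the mxfp4 scheme added above
print(llm.generate("Hello")[0].outputs[0].text)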
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f155a1b11f..a4a6157fa4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -33,7 +33,8 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx +from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, + round_up) from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): @@ -719,6 +720,12 @@ class FusedMoE(torch.nn.Module): self.global_num_experts = num_experts + num_redundant_experts + # we padding globally so EP buffer allocation works + if quant_config and quant_config.get_name() == "mxfp4" and ( + envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) + # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: @@ -1064,6 +1071,18 @@ class FusedMoE(torch.nn.Module): shard_id: str, expert_id: int, return_success: bool = False) -> Optional[bool]: + + if self.quant_config and self.quant_config.get_name() == "mxfp4": + # (FIXME) for gpt-oss all experts are combined + if "bias" in weight_name: + dim1 = loaded_weight.shape[1] + param.data[:, :dim1].copy_(loaded_weight) + else: + dim1 = loaded_weight.shape[1] + dim2 = loaded_weight.shape[2] + param.data[:, :dim1, :dim2].copy_(loaded_weight) + return True if return_success else None + expert_id = self._map_global_expert_id_to_local_expert_id(expert_id) if expert_id == -1: # Failed to load this param since it's not local to this rank @@ -1476,13 +1495,20 @@ class FusedMoE(torch.nn.Module): def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): + og_hidden_states = hidden_states.shape[-1] + if self.hidden_size != og_hidden_states: + hidden_states = F.pad(hidden_states, + (0, self.hidden_size - og_hidden_states), + mode='constant', + value=0.0) # TODO: Once the OOM issue for the TPU backend is resolved, we will # switch to using the moe_forward custom op. 
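# (Editor's note) The padding above and the [..., :og_hidden_states] slice just below
# keep the layer's public width unchanged: activations are zero-padded from
# og_hidden_states up to self.hidden_size, which __init__ rounds up to a multiple of
# 256 for the flashinfer mxfp4 path (e.g. 2880 -> 3072), and the custom-op output is
# then sliced back to og_hidden_states.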
if current_platform.is_tpu(): return self.forward_impl(hidden_states, router_logits) else: - return torch.ops.vllm.moe_forward(hidden_states, router_logits, - self.layer_name) + return torch.ops.vllm.moe_forward( + hidden_states, router_logits, + self.layer_name)[..., :og_hidden_states] def forward_impl_chunked(self, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor): diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 95aea912a1..8d63027e18 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -37,6 +37,7 @@ QuantizationMethods = Literal[ "auto-round", "rtn", "inc", + "mxfp4", ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) @@ -110,6 +111,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config + from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config from .qqq import QQQConfig @@ -148,6 +150,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "auto-round": AutoRoundConfig, "rtn": RTNConfig, "inc": INCConfig, + "mxfp4": Mxfp4Config, } # Update the `method_to_config` with customized quantization methods. method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py new file mode 100644 index 0000000000..b6d7bc5d5c --- /dev/null +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -0,0 +1,387 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Callable, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import envs +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( + _can_support_mxfp4) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import next_power_of_2, round_up + +if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + # from flashinfer.fused_moe import cutlass_fused_moe + from flashinfer import (mxfp8_quantize, shuffle_matrix_a, + shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe) + + +class Mxfp4Config(QuantizationConfig): + + def __init__(self, ignored_layers: Optional[list[str]] = None): + super().__init__() + self.ignored_layers = ignored_layers + + @classmethod + def from_config(cls, config): + return cls() + + @classmethod + def get_min_capability(cls) -> int: + return 100 + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "mxfp4" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> 
Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + if self.ignored_layers and is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping): + return UnquantizedLinearMethod() + raise NotImplementedError("Mxfp4 linear layer is not implemented") + elif isinstance(layer, FusedMoE): + return Mxfp4MoEMethod(layer.moe_config) + elif isinstance(layer, Attention): + raise NotImplementedError( + "Mxfp4 attention layer is not implemented") + return None + + +class Mxfp4MoEMethod(FusedMoEMethodBase): + + def __init__(self, moe: FusedMoEConfig): + super().__init__() + self.topk_indices_dtype = None + self.moe = moe + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + self.num_experts = num_experts + weight_dtype = torch.uint8 + scale_dtype = torch.uint8 + + # FIXME (zyongye): ship after torch and safetensors support mxfp4 + # is_torch_mxfp4_available = ( + # hasattr(torch, "float4_e2m1fn_x2") and + # hasattr(torch, "float8_e8m0fnu")) + # if is_torch_mxfp4_available: + # weight_dtype = torch.float4_e2m1fn_x2 + # scale_dtype = torch.float8_e8m0fnu + + mxfp4_block = 32 + + intermediate_size_per_partition_after_pad = \ + intermediate_size_per_partition + # pad the intermediate size to be a multiple of 2 * mxfp4_block + # for to hold non-uniform sharded tensor as well as swizzling + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 256) + hidden_size = round_up(hidden_size, 256) + + self.intermediate_size = intermediate_size_per_partition_after_pad + self.hidden_size = hidden_size + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // 2, + dtype=weight_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w13_weight_scale = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // mxfp4_block, + dtype=scale_dtype), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + w13_bias = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16), + requires_grad=False) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // 2, + dtype=weight_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w2_weight_scale = torch.nn.Parameter(torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // mxfp4_block, + dtype=scale_dtype), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + w2_bias = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=torch.bfloat16), + requires_grad=False) + 
layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + def process_weights_after_loading(self, layer): + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + layer.gemm1_alpha = Parameter(torch.tensor( + [1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_beta = Parameter(torch.tensor( + [1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + layer.gemm1_clamp_limit = Parameter(torch.tensor( + [7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False) + sf_block_size = 32 # mxfp4 block size + + assert (layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2) + assert (layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] + == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] + == self.hidden_size // sf_block_size) + assert (layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size and + layer.w2_weight.shape[2] == self.intermediate_size // 2) + assert (layer.w2_weight_scale.dim() == 3 + and layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size) + assert (layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2) + assert (layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size) + + w13_weight_scale = layer.w13_weight_scale.data + w2_weight_scale = layer.w2_weight_scale.data + w13_weight = layer.w13_weight.data + w2_weight = layer.w2_weight.data + w13_bias = layer.w13_bias.data.to(torch.float32) + w2_bias = layer.w2_bias.data.to(torch.float32) + + # Swap w1 and w3 as the defenition of + # swiglu is different in the trtllm-gen + def swap_every_two_rows(x, axis=-1): + shape = x.shape + if axis < 0: + axis = len(shape) + axis + + # Create a new shape with pairs swapped along specified axis + new_shape = list(shape) + new_shape[axis] = shape[axis] // 2 + new_shape.insert(axis + 1, 2) + + # Reshape to expose pairs, swap them, and reshape back + x = x.reshape(*new_shape) + x = x.flip(axis + 1) + new_shape = list(shape) + return x.reshape(*new_shape) + + w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2) + w13_weight = swap_every_two_rows(w13_weight, -2) + w13_bias = swap_every_two_rows(w13_bias, -1) + + # Do not interleave as the checkpoint is already interleaved + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_mxfp4_shuffled = [] + gemm1_scales_mxfp4_shuffled = [] + gemm2_weights_mxfp4_shuffled = [] + gemm2_scales_mxfp4_shuffled = [] + gemm1_bias_shuffled = [] + gemm2_bias_shuffled = [] + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + for i in range(self.num_experts): + gemm1_weights_mxfp4_shuffled.append( + shuffle_matrix_a(w13_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_scales_mxfp4_shuffled.append( + shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_bias_shuffled.append( + shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m)) + + gemm2_weights_mxfp4_shuffled.append( + 
shuffle_matrix_a(w2_weight[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_scales_mxfp4_shuffled.append( + shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_bias_shuffled.append( + shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m)) + + w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled) + w13_weight_scale = torch.stack( + gemm1_scales_mxfp4_shuffled).reshape( + self.num_experts, 2 * self.intermediate_size, + self.hidden_size // sf_block_size).view( + torch.float8_e4m3fn) + + w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled) + w2_weight_scale = torch.stack(gemm2_scales_mxfp4_shuffled).reshape( + self.num_experts, self.hidden_size, self.intermediate_size // + sf_block_size).view(torch.float8_e4m3fn) + + layer.w13_weight = Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = Parameter(w13_weight_scale, + requires_grad=False) + layer.w2_weight = Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = Parameter(w2_weight_scale, + requires_grad=False) + layer.w13_bias = Parameter( + torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1), + requires_grad=False) + layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( + self.num_experts, -1), + requires_grad=False) + return + + def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int): + # Number of tokens in the input tensor. + num_tokens = x.shape[0] + # Factor to account for the imbalance of the experts. + # factor equals to the + # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert + # - 1.0 means perfect expert distribution. + # - > 1.0 means some experts have more + # tokens than the perfect distribution. + # - < 1.0 does not make sense. + imbalance_factor = 1.3 + # Calculate the number of tokens per expert + # assuming perfect distribution. + num_tokens_per_expert = (num_tokens * top_k) // self.num_experts + # Apply the imbalance factor. + num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor) + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile + # as it's the range supported by the kernel. 
+ tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + + return tile_tokens_dim + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + if enable_eplb: + raise NotImplementedError("EPLB is not supported for mxfp4") + + assert _can_support_mxfp4( + use_grouped_topk, topk_group, num_expert_group, expert_map, + custom_routing_function, e_score_correction_bias, + apply_router_weight_on_input, scoring_func, activation, + expert_load_view, logical_to_physical_map, + logical_replica_count), ("MXFP4 are not supported\ + with this configuration.") + + if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + assert not self.moe.use_ep, ( + "EP is not supported for flashinfer mxfp4 moe backend yet.") + if envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: + assert x.dtype == torch.bfloat16 + x_quant = x + x_scale = None + else: + x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8 + x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1) + trtllm_gen_output = trtllm_fp4_block_scale_moe( + router_logits.to(torch.bfloat16), + None, # routing_bias + x_quant, + x_scale, + layer.w13_weight, # uint8 (e2m1 x 2) + layer.w13_weight_scale, # uint8 (e4m3 x 2) + layer.w13_bias, # fp32 per expert per channel + layer.gemm1_alpha, # fp32 per expert + layer.gemm1_beta, # fp32 per expert + layer.gemm1_clamp_limit, # fp32 per expert + layer.w2_weight, # uint8 (e2m1 x 2) + layer.w2_weight_scale, # ue8m0 + layer.w2_bias, # fp32 per expert per channel + None, # output1_scale_scalar + None, # output1_scale_gate_scalar + None, # output2_scale_scalar + self.num_experts, + top_k, + None, # n_group + None, # topk_group + self.intermediate_size, # padded to multiple of 256 + 0, # local_expert_offset + self.num_experts, # local num experts + None, + self._get_tile_tokens_dim(x, top_k), + 1 if renormalize else 0, # routing_method_type, renormalize + True, # do finalize + )[0] + return trtllm_gen_output diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 1119045db0..4a4e199e13 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Callable, Optional + import torch from vllm.utils import direct_register_custom_op @@ -7,6 +9,26 @@ from vllm.utils import direct_register_custom_op OCP_MX_BLOCK_SIZE = 32 +def _can_support_mxfp4(use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + e_score_correction_bias: Optional[torch.Tensor] = None, + 
apply_router_weight_on_input: bool = False, + scoring_func: str = "softmax", + activation: str = "silu", + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None): + return not (use_grouped_topk or topk_group or num_expert_group + or expert_map or custom_routing_function + or e_score_correction_bias or apply_router_weight_on_input + or scoring_func != "softmax" or activation != "silu" + or expert_load_view or logical_to_physical_map + or logical_replica_count) + + def _dequant_mxfp4(x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype) -> torch.Tensor: try: From 46a13949d5f64e4a40bac3cb30eb0f867074f741 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 7 Aug 2025 03:03:42 +0300 Subject: [PATCH 031/932] [v1] - Mamba1 Attention Metadata (#21249) Signed-off-by: asafg Co-authored-by: asafg --- csrc/mamba/mamba_ssm/selective_scan.h | 3 + csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 18 ++- docs/models/supported_models.md | 4 +- docs/usage/v1_guide.md | 12 +- .../models/language/generation/test_hybrid.py | 2 + tests/v1/test_oracle.py | 1 - .../layers/mamba/mamba_mixer.py | 144 +++++++++++++----- .../layers/mamba/mamba_mixer2.py | 7 +- .../layers/mamba/mamba_utils.py | 99 +++++++----- vllm/model_executor/models/bamba.py | 5 +- vllm/model_executor/models/falcon_h1.py | 5 +- .../model_executor/models/granitemoehybrid.py | 5 +- vllm/model_executor/models/jamba.py | 60 +++++--- vllm/model_executor/models/mamba.py | 77 ++++++---- vllm/model_executor/models/mamba2.py | 5 +- vllm/model_executor/models/nemotron_h.py | 5 +- vllm/model_executor/models/zamba2.py | 5 +- vllm/v1/attention/backends/mamba1_attn.py | 67 ++++++++ vllm/v1/attention/backends/mamba_selectors.py | 4 + 19 files changed, 367 insertions(+), 161 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba1_attn.py diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h index 563d2fe4ef..13c6178941 100644 --- a/csrc/mamba/mamba_ssm/selective_scan.h +++ b/csrc/mamba/mamba_ssm/selective_scan.h @@ -45,6 +45,9 @@ struct SSMParamsBase { index_t out_d_stride; index_t out_z_batch_stride; index_t out_z_d_stride; + index_t ssm_states_batch_stride; + index_t ssm_states_dim_stride; + index_t ssm_states_dstate_stride; // Common data pointers. 
void *__restrict__ A_ptr; diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 5766fbab4e..c4ddbc1427 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -132,8 +132,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate; - + input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + + cache_index * params.ssm_states_batch_stride + + dim_id * kNRows * params.ssm_states_dim_stride; + float D_val[kNRows] = {0}; if (params.D_ptr != nullptr) { #pragma unroll @@ -248,7 +250,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } // Initialize running total - scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0); + scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0); SSMScanPrefixCallbackOp prefix_op(running_prefix); typename Ktraits::BlockScanT(smem_scan).InclusiveScan( @@ -259,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if (threadIdx.x == 0) { smem_running_prefix[state_idx] = prefix_op.running_prefix; if (chunk == n_chunks - 1) { - ssm_states[state_idx] = input_t(prefix_op.running_prefix.y); + ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y); } } #pragma unroll @@ -481,6 +483,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.out_batch_stride = out.stride(1); params.out_d_stride = out.stride(0); + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); + } else{ if (!is_variable_B) { @@ -509,6 +515,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, } params.out_batch_stride = out.stride(0); params.out_d_stride = out.stride(1); + + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); } } diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 120fd3f485..3816412268 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -370,9 +370,9 @@ th { | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | -| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ | | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ | | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 38399c6633..d30144e8a8 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the | **Decoder-only Models** | 🚀 Optimized | | **Encoder-Decoder Models** | 🟠 Delayed | | **Embedding Models** | 🟢 Functional | -| **Mamba Models** | 🟢 (Mamba-2), 🟡 (Mamba-1) | +| **Mamba Models** | 🟢 (Mamba-2), 🟢 (Mamba-1) | | **Multimodal Models** | 🟢 Functional | vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. @@ -104,13 +104,11 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models -Models using selective state-space mechanisms instead of standard transformer attention are partially supported. -Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers -(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require -disabling prefix caching in V1. +Models using selective state-space mechanisms instead of standard transformer attention are supported. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. -Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that +Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. 
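For illustration, here is a minimal sketch of launching a Mamba-1 model under the constraints described in the updated guide above (eager mode, prefix caching disabled). It assumes the V1 engine is active (e.g. `VLLM_USE_V1=1`); the model name and sampling settings are placeholders, not recommendations.

```python
# Hedged example: serving a Mamba-1 model on the V1 engine under the
# constraints listed above. Assumes VLLM_USE_V1=1 in the environment.
from vllm import LLM, SamplingParams

llm = LLM(
    model="state-spaces/mamba-130m-hf",  # Mamba-1 checkpoint used in the tests above
    enforce_eager=True,                  # Mamba-1 currently requires eager mode in V1
    enable_prefix_caching=False,         # prefix caching must be disabled
)
out = llm.generate(["The capital of France is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```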
#### Encoder-Decoder Models diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 2238924c1b..67ba2f2559 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -53,6 +53,8 @@ HF_UNSUPPORTED_MODELS = [ ] V1_SUPPORTED_MODELS = [ + "state-spaces/mamba-130m-hf", + "ai21labs/Jamba-tiny-dev", "mistralai/Mamba-Codestral-7B-v0.1", "ibm-ai-platform/Bamba-9B-v1", "Zyphra/Zamba2-1.2B-instruct", diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index b68ed298a1..a756c89b52 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -12,7 +12,6 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine UNSUPPORTED_MODELS_V1 = [ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder - "state-spaces/mamba-130m-hf", # mamba1 ] MODEL = "meta-llama/Llama-3.2-1B-Instruct" diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 60cf3e1188..17b7f84a93 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,30 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import torch from torch import nn from torch.nn.parameter import Parameter -from vllm.attention.backends.abstract import AttentionMetadata +from vllm import envs +from vllm.config import get_current_vllm_config from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.forward_context import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer @CustomOp.register("mamba_mixer") -class MambaMixer(CustomOp): +class MambaMixer(MambaBase, CustomOp): """ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. 
A, D are input independent @@ -47,13 +54,16 @@ class MambaMixer(CustomOp): rms_norm_has_weight: bool = True, rms_norm_eps: float = 1e-5, activation="silu", - is_lora_enabled: bool = False): + is_lora_enabled: bool = False, + prefix: str = ""): super().__init__() self.time_step_rank = time_step_rank self.ssm_state_size = ssm_state_size self.use_rms_norm = use_rms_norm self.activation = activation self.is_lora_enabled = is_lora_enabled + self.conv_kernel_size = conv_kernel_size + self.intermediate_size = intermediate_size self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, @@ -131,14 +141,62 @@ class MambaMixer(CustomOp): has_weight=rms_norm_has_weight, ) if use_rms_norm else None - def forward_native(self, hidden_states: torch.Tensor, - conv_state: torch.Tensor, ssm_state: torch.Tensor): + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The outer list is for v0 PP virtual engine. Though this code path + # only runs for v1, we have to do this to unify with the interface + # of Attention + v0 PP. + # The inner tuple is (conv_state, ssm_state) + self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + + self.prefix = prefix + + def forward(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): + if not envs.VLLM_USE_V1: + return CustomOp.forward(self, hidden_states, mamba_cache_params) + else: + return self.forward_cuda(hidden_states, mamba_cache_params) + + def forward_native(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): pass - def forward_cuda(self, hidden_states: torch.Tensor, - mamba_cache_params: MambaCacheParams): + def forward_cuda(self, + hidden_states: torch.Tensor, + mamba_cache_params: Optional[MambaCacheParams] = None): - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + + if envs.VLLM_USE_V1: + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + mamba1_metadata = attn_metadata + assert isinstance(mamba1_metadata, Mamba1AttentionMetadata) + query_start_loc = mamba1_metadata.query_start_loc + state_indices_tensor = mamba1_metadata.state_indices_tensor + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + has_initial_state = mamba1_metadata.has_initial_states + context_lens_tensor = mamba1_metadata.context_lens_tensor + else: + assert mamba_cache_params is not None + conv_state = mamba_cache_params.conv_state + ssm_state = mamba_cache_params.ssm_state + state_indices_tensor = mamba_cache_params.state_indices_tensor + query_start_loc = attn_metadata.query_start_loc + context_lens_tensor = attn_metadata.context_lens_tensor + + if context_lens_tensor is not None: + has_initial_state = context_lens_tensor > 0 # 1. 
Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -148,8 +206,12 @@ class MambaMixer(CustomOp): conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: + if envs.VLLM_USE_V1 and attn_metadata is None: + # V1 profile run + hidden_states = hidden_states.contiguous() + return self.out_proj(hidden_states.transpose(-2, -1))[0] + + if query_start_loc is not None and context_lens_tensor is not None: # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| @@ -161,18 +223,18 @@ class MambaMixer(CustomOp): conv_weights, bias=self.conv1d.bias, activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=attn_metadata.context_lens_tensor > 0, - cache_indices=mamba_cache_params.state_indices_tensor, - query_start_loc=attn_metadata.query_start_loc) + conv_states=conv_state, + has_initial_state=has_initial_state, + cache_indices=state_indices_tensor, + query_start_loc=query_start_loc) else: hidden_states = causal_conv1d_update( hidden_states.transpose(0, 1), - mamba_cache_params.conv_state, + conv_state, conv_weights, self.conv1d.bias, self.activation, - conv_state_indices=mamba_cache_params.state_indices_tensor) + conv_state_indices=state_indices_tensor) hidden_states = hidden_states.transpose(0, 1) # 3. State Space Model sequence transformation @@ -203,11 +265,10 @@ class MambaMixer(CustomOp): time_proj_bias = (self.dt_proj.bias.float() if hasattr( self.dt_proj, "bias") else None) - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: + if query_start_loc is not None and context_lens_tensor is not None: scan_outputs = selective_scan_fn( hidden_states, - mamba_cache_params.ssm_state, + ssm_state, discrete_time_step, self.A, B.transpose(-2, -1), @@ -216,24 +277,23 @@ class MambaMixer(CustomOp): gate, time_proj_bias, delta_softplus=True, - cache_indices=mamba_cache_params.state_indices_tensor, - has_initial_state=attn_metadata.context_lens_tensor > 0, - query_start_loc=attn_metadata.query_start_loc) + cache_indices=state_indices_tensor, + has_initial_state=has_initial_state, + query_start_loc=query_start_loc) else: scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) - selective_state_update( - mamba_cache_params.ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), - self.A, - B, - C, - self.D, - gate.transpose(0, 1), - time_proj_bias, - dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor, - out=scan_outputs) + selective_state_update(ssm_state, + hidden_states.transpose(0, 1), + discrete_time_step.transpose(0, 1), + self.A, + B, + C, + self.D, + gate.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + state_batch_indices=state_indices_tensor, + out=scan_outputs) scan_outputs = scan_outputs.transpose(0, 1) # 4. 
Final linear projection @@ -245,3 +305,15 @@ class MambaMixer(CustomOp): contextualized_states = self.out_proj( scan_outputs.transpose(-2, -1))[0] return contextualized_states + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=get_tensor_model_parallel_world_size(), + intermediate_size=self.intermediate_size, + state_size=self.ssm_state_size, + conv_kernel=self.conv_kernel_size, + ) + + @property + def mamba_type(self) -> str: + return "mamba1" diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 5ac9a7f9ab..d5f4877135 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata, update_metadata) from vllm.model_executor.layers.mamba.mamba_utils import ( - extra_groups_for_head_shards, get_mamba_state_shape) + MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated @@ -278,8 +278,9 @@ class MambaMixer2(MambaBase, CustomOp): # - for TP we shard conv_dim by sharding on n_groups, # - but if n_groups cannot divide tp_size, we need to # extend some extra groups - self.n_groups = n_groups + extra_groups_for_head_shards( + groups = MambaStateShapeCalculator.extra_groups_for_head_shards( n_groups, self.tp_size) + self.n_groups = n_groups + groups self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size self.conv1d = ColumnParallelLinear( @@ -732,7 +733,7 @@ class MambaMixer2(MambaBase, CustomOp): output[:num_actual_tokens], _ = self.out_proj(hidden_states) def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=self.intermediate_size, tp_world_size=get_tensor_model_parallel_world_size(), n_groups=self.n_groups, diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 99a582066c..42c815b08f 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -3,53 +3,70 @@ from vllm.distributed import divide -def extra_groups_for_head_shards(ngroups: int, tp_size: int): - """Compute the increase in group numbers to account for - replication in order to accompany the head shards.""" +class MambaStateShapeCalculator: - # in the case ngoups % tp_size == 0, this will be zero - if ngroups % tp_size == 0: - return 0 + @classmethod + def mamba1_state_shape( + cls, + tp_world_size: int, + intermediate_size: int, + state_size: int, + conv_kernel: int, + use_v1: bool = True, + ) -> tuple[tuple[int, int], tuple[int, int]]: + conv_state_shape = (divide(intermediate_size, + tp_world_size), conv_kernel - 1) - # for n_groups == 1, this is exactly tp_size - n_groups - return tp_size - ngroups + temporal_state_shape = (divide(intermediate_size, + tp_world_size), state_size) + # In V0, the conv_state shape was swapped during allocation in + # MambaCacheManager, but in V1 it needs to be determined here at the + # calculation level + if use_v1: + conv_state_shape = conv_state_shape[1], conv_state_shape[0] -def get_mamba_state_shape( - intermediate_size: int, - 
tp_world_size: int, - n_groups: int, - num_heads: int, - head_dim: int, - state_size: int, - conv_kernel: int, - use_v1: bool = True, -) -> tuple[tuple[int, int], tuple[int, int, int]]: - """ Get the shape of mamba state.""" + return conv_state_shape, temporal_state_shape - # if n_groups is not divisible by world_size, need to extend the shards - # to ensure all groups needed by a head is sharded along with it - n_groups = (n_groups + - extra_groups_for_head_shards(n_groups, tp_world_size)) + @classmethod + def mamba2_state_shape( + cls, + tp_world_size: int, + intermediate_size: int, + n_groups: int, + num_heads: int, + head_dim: int, + state_size: int, + conv_kernel: int, + use_v1: bool = True, + ) -> tuple[tuple[int, int], tuple[int, int, int]]: + # if n_groups is not divisible by world_size, need to extend the shards + # to ensure all groups needed by a head is sharded along with it + n_groups = n_groups + cls.extra_groups_for_head_shards( + n_groups, tp_world_size) + # heads and n_groups are TP-ed + conv_dim = intermediate_size + 2 * n_groups * state_size - # - heads and n_groups are TP-ed - conv_dim = (intermediate_size + 2 * n_groups * state_size) - # contiguous along 'dim' axis - conv_state_shape = ( - conv_kernel - 1, - divide(conv_dim, tp_world_size), - ) + # contiguous along 'dim' axis + conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size)) + if not use_v1: + conv_state_shape = conv_state_shape[1], conv_state_shape[0] - if not use_v1: - conv_state_shape = (conv_state_shape[1], conv_state_shape[0]) + # These are not TP-ed as they depend on A, dt_bias, D + # - they are typically small + # e.g., (h_heads, head_dim, state_size) = (128, 64, 128) + temporal_state_shape = (divide(num_heads, + tp_world_size), head_dim, state_size) + return conv_state_shape, temporal_state_shape - # These are not TP-ed as they depend on A, dt_bias, D - # - they are typically small - # e.g., (h_heads, head_dim, state_size) = (128, 64, 128) - temporal_state_shape = ( - divide(num_heads, tp_world_size), - head_dim, - state_size, - ) + @classmethod + def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int): + """Compute the increase in group numbers to account for + replication in order to accompany the head shards.""" - return conv_state_shape, temporal_state_shape + # in the case ngoups % tp_size == 0, this will be zero + if ngroups % tp_size == 0: + return 0 + + # for n_groups == 1, this is exactly tp_size - n_groups + return tp_size - ngroups diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 0f54944276..4a2ae07581 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -25,7 +25,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -457,7 +458,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - 
return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 6a58b1501f..85d64af5bd 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -24,7 +24,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -543,7 +544,7 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, if hf_config.mamba_d_ssm is None else hf_config.mamba_d_ssm) - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 59c1dce48e..e59502f12a 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -23,7 +23,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -547,7 +548,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_n_groups, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index ab21b7ce2c..c1033aff07 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -8,6 +8,7 @@ import torch from torch import nn from transformers import JambaConfig +from vllm import envs from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -19,6 +20,8 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from 
vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -32,8 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType -from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, - SupportsV0Only) +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -112,7 +114,8 @@ class JambaMambaDecoderLayer(nn.Module): use_rms_norm=True, rms_norm_eps=config.rms_norm_eps, activation=config.hidden_act, - is_lora_enabled = self.is_lora_enabled + is_lora_enabled = self.is_lora_enabled, + prefix=f"{prefix}.mixer", ) num_experts = config.layers_num_experts[layer_idx] @@ -344,7 +347,8 @@ class JambaModel(nn.Module): layer_mamba_cache_params = None if isinstance(layer, JambaAttentionDecoderLayer): kv_cache_index += 1 - if isinstance(layer, JambaMambaDecoderLayer): + if isinstance(layer, + JambaMambaDecoderLayer) and mamba_cache_params: current_state_layer = mamba_cache_index layer_mamba_cache_params = mamba_cache_params.at_layer_idx( current_state_layer) @@ -442,7 +446,7 @@ class JambaModel(nn.Module): class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, - IsHybrid, SupportsV0Only): + IsHybrid): hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={ ".self_attn.": ".", ".A_log": ".A" @@ -509,14 +513,19 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + # NOTE: mamba_cache_params is not needed for v1 + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) + state_shape = self.get_mamba_state_shape_from_config( + self.vllm_config) + self.mamba_cache = MambaCacheManager(self.vllm_config, + self.lm_head.weight.dtype, + num_layers, *state_shape) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) @@ -529,19 +538,22 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: - world_size = get_tensor_model_parallel_world_size() - hidden_size = self.config.hidden_size - conv_state_shape = ( - self.config.mamba_expand * hidden_size // world_size, - self.config.mamba_d_conv - 1, + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + 
hidden_size = hf_config.hidden_size + + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=parallel_config.tensor_parallel_size, + intermediate_size=hf_config.mamba_expand * hidden_size, + state_size=hf_config.mamba_d_state, + conv_kernel=hf_config.mamba_d_conv, + use_v1=envs.VLLM_USE_V1, ) - temporal_state_shape = ( - self.config.mamba_expand * hidden_size // world_size, - self.config.mamba_d_state, - ) - return conv_state_shape, temporal_state_shape def compute_logits( self, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 8162ac3f75..80b63e1537 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -8,20 +8,21 @@ import torch from torch import nn from transformers import MambaConfig +from vllm import envs from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, - IsAttentionFree, SupportsPP, - SupportsV0Only) + IsAttentionFree, SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -41,7 +42,8 @@ class MambaDecoderLayer(nn.Module): config: MambaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - is_lora_enabled: Optional[bool] = False) -> None: + is_lora_enabled: Optional[bool] = False, + prefix: str = "") -> None: super().__init__() self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" @@ -58,7 +60,8 @@ class MambaDecoderLayer(nn.Module): rms_norm_has_weight=not self.is_falcon_mamba, rms_norm_eps=mixer_rms_eps, activation=config.hidden_act, - is_lora_enabled=self.is_lora_enabled) + is_lora_enabled=self.is_lora_enabled, + prefix=f"{prefix}.mixer") self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -107,7 +110,8 @@ class MambaModel(nn.Module): lambda prefix: MambaDecoderLayer(config, cache_config=cache_config, quant_config=quant_config, - is_lora_enabled=is_lora_enabled), + is_lora_enabled=is_lora_enabled, + prefix=prefix), prefix=f"{prefix}.layers") self.norm_f = RMSNorm(config.hidden_size, @@ -123,7 +127,7 @@ class MambaModel(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - mamba_cache_params: MambaCacheParams, + mamba_cache_params: Optional[MambaCacheParams] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -140,12 +144,17 @@ class MambaModel(nn.Module): for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + + layer_cache_params = None + if mamba_cache_params is not None: + layer_cache_params = mamba_cache_params.at_layer_idx( + i - self.start_layer) + hidden_states, residual = layer( 
positions=positions, hidden_states=hidden_states, residual=residual, - mamba_cache_params=mamba_cache_params.at_layer_idx( - i - self.start_layer)) + mamba_cache_params=layer_cache_params) if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, @@ -176,8 +185,7 @@ class MambaModel(nn.Module): return loaded_params -class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, - SupportsV0Only): +class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config @@ -227,20 +235,40 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) - self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = None + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) + state_shape = self.get_mamba_state_shape_from_config( + self.vllm_config) + self.mamba_cache = MambaCacheManager(self.vllm_config, + self.lm_head.weight.dtype, + num_layers, *state_shape) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.backbone(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) return hidden_states + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=parallel_config.tensor_parallel_size, + intermediate_size=hf_config.intermediate_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_v1=envs.VLLM_USE_V1) + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): return self.mamba_cache.copy_inputs_before_cuda_graphs( input_buffers, **kwargs) @@ -248,19 +276,6 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def _get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int]]: - world_size = get_tensor_model_parallel_world_size() - conv_state_shape = ( - self.config.intermediate_size // world_size, - self.config.conv_kernel - 1, - ) - temporal_state_shape = ( - self.config.intermediate_size // world_size, - self.config.state_size, - ) - return conv_state_shape, temporal_state_shape - def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: logits = self.logits_processor(self.lm_head, hidden_states, diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index adad181617..75e92b0176 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -19,7 +19,8 @@ from vllm.model_executor.layers.logits_processor import 
LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -220,7 +221,7 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree): hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.n_groups, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 6a999e2254..eb62d5a53c 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -39,7 +39,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -482,7 +483,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.n_groups, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 7764fd9b9e..4cb0becf30 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -32,7 +32,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba2_metadata import ( Mamba2Metadata, prepare_mamba2_metadata) from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 -from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -869,7 +870,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): hf_config = vllm_config.model_config.hf_config intermediate_size = hf_config.mamba_expand * hf_config.hidden_size - return get_mamba_state_shape( + return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, tp_world_size=parallel_config.tensor_parallel_size, n_groups=hf_config.mamba_ngroups, diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py new file mode 
100644 index 0000000000..f0e4636fdb --- /dev/null +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import ClassVar + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + + +class Mamba1AttentionBackend(AttentionBackend): + + @staticmethod + def get_builder_cls() -> type["Mamba1AttentionMetadataBuilder"]: + return Mamba1AttentionMetadataBuilder + + +@dataclass +class Mamba1AttentionMetadata: + query_start_loc: torch.Tensor + context_lens_tensor: torch.Tensor + state_indices_tensor: torch.Tensor + has_initial_states: torch.Tensor + + +class Mamba1AttentionMetadataBuilder( + AttentionMetadataBuilder[Mamba1AttentionMetadata]): + + reorder_batch_threshold: ClassVar[int] = 1 + + def __init__( + self, + kv_cache_spec: AttentionSpec, + vllm_config: VllmConfig, + device: torch.device, + layer_names: list[str], + ): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> Mamba1AttentionMetadata: + query_start_loc = common_attn_metadata.query_start_loc + + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to( + query_start_loc.device) + has_initial_states = (context_lens_tensor > 0) + + return Mamba1AttentionMetadata( + query_start_loc=query_start_loc, + context_lens_tensor=context_lens_tensor, + has_initial_states=has_initial_states, + state_indices_tensor=state_indices_tensor, + ) diff --git a/vllm/v1/attention/backends/mamba_selectors.py b/vllm/v1/attention/backends/mamba_selectors.py index 80021a2165..f56f2fb7bf 100644 --- a/vllm/v1/attention/backends/mamba_selectors.py +++ b/vllm/v1/attention/backends/mamba_selectors.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend def get_mamba_attn_backend(mamba_type: str) -> type[AttentionBackend]: + if mamba_type == "mamba1": + return Mamba1AttentionBackend + if mamba_type == "mamba2": return Mamba2AttentionBackend From eec890c1c1cdf6d4bbf4c0563fac54abe80ab8b6 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 6 Aug 2025 20:03:53 -0400 Subject: [PATCH 032/932] [Bug] Fix B200 DeepGEMM E8M0 Accuracy Issue (#22399) Signed-off-by: yewentao256 --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 2aece9a1de..68a061968a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -799,7 +799,8 @@ def requant_weight_ue8m0_inplace( 
s_exp = s_exp[:m_cur, :k_cur] w_dq = w_q.to(torch.float32) * s_exp # Re-quantise using power-of-two scaling (UE8M0). - w_requant, s_requant = per_block_cast_to_fp8(w_dq, [block_m, block_k]) + w_requant, s_requant = per_block_cast_to_fp8(w_dq, [block_m, block_k], + use_ue8m0=True) # Write back the results in-place. w_q.copy_(w_requant) From 19c9365aa48d514ae6ef45242359dc98c6046666 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 6 Aug 2025 17:47:14 -0700 Subject: [PATCH 033/932] [gpt-oss] add demo tool server (#22393) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 7 ++ vllm/entrypoints/openai/cli_args.py | 4 ++ vllm/entrypoints/openai/serving_responses.py | 4 ++ vllm/entrypoints/tool_server.py | 70 ++++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 vllm/entrypoints/tool_server.py diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9bf4702320..88ef16b87e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -92,6 +92,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation) from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.tool_server import DemoToolServer, ToolServer from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) from vllm.logger import init_logger @@ -1620,6 +1621,11 @@ async def init_app_state( "This discrepancy may lead to performance degradation.", resolved_chat_template, args.model) + if args.tool_server == "demo": + tool_server: Optional[ToolServer] = DemoToolServer() + else: + tool_server = None + # Merge default_mm_loras into the static lora_modules default_mm_loras = (vllm_config.lora_config.default_mm_loras if vllm_config.lora_config is not None else {}) @@ -1654,6 +1660,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + tool_server=tool_server, reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index dfbc9cde3d..12318b300c 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -147,6 +147,10 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" """Special the tool parser plugin write to parse the model-generated tool into OpenAI API format, the name register in this plugin can be used in `--tool-call-parser`.""" + tool_server: Optional[str] = None + """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname). + Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. 
Or `demo` for demo + purpose.""" log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH """Path to logging config JSON file for both vllm and uvicorn""" max_log_len: Optional[int] = None diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f340854386..4ca863fd07 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -29,6 +29,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.tool_server import ToolServer from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams @@ -53,6 +54,7 @@ class OpenAIServingResponses(OpenAIServing): reasoning_parser: str = "", enable_auto_tools: bool = False, tool_parser: Optional[str] = None, + tool_server: Optional[ToolServer] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, ) -> None: @@ -114,6 +116,8 @@ class OpenAIServingResponses(OpenAIServing): self.background_tasks: dict[str, asyncio.Task] = {} + self.tool_server = tool_server + async def create_responses( self, request: ResponsesRequest, diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py new file mode 100644 index 0000000000..769c40e8cc --- /dev/null +++ b/vllm/entrypoints/tool_server.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import Any, Optional + +from openai_harmony import ToolNamespaceConfig + +from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class ToolServer(ABC): + + @abstractmethod + def has_tool(self, tool_name: str) -> bool: + """ + Return True if the tool is supported, False otherwise. + """ + pass + + @abstractmethod + def get_tool_description(self, + tool_name: str) -> Optional[ToolNamespaceConfig]: + """ + Return the tool description for the given tool name. + If the tool is not supported, return None. + """ + pass + + @abstractmethod + def new_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]: + """ + Create a session for the tool. + """ + ... 
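As a usage sketch (not part of the patch itself), this is one way a caller could consume the abstract `ToolServer` interface defined above. The default tool name and what is done with the yielded session are assumptions for illustration only; the real integration lives in the serving layer.

```python
# Hedged sketch of consuming the ToolServer interface defined above.
# "browser" and the handling of `session` are illustrative; concrete
# behavior depends on the ToolServer implementation in use.
from typing import Optional


async def maybe_use_tool(server: "ToolServer",
                         tool_name: str = "browser") -> Optional[str]:
    if not server.has_tool(tool_name):
        return None
    config = server.get_tool_description(tool_name)  # ToolNamespaceConfig or None
    async with server.new_session(tool_name) as session:
        # The yielded object is implementation-defined (a Harmony tool
        # instance for the demo server defined below); report its type only.
        return (f"{tool_name} (config={'set' if config else 'none'}): "
                f"{type(session).__name__}")
```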
+ + +class DemoToolServer(ToolServer): + + def __init__(self): + self.tools: dict[str, Tool] = {} + browser_tool = HarmonyBrowserTool() + if browser_tool.enabled: + self.tools["browser"] = browser_tool + python_tool = HarmonyPythonTool() + if python_tool.enabled: + self.tools["python"] = python_tool + logger.info("DemoToolServer initialized with tools: %s", + list(self.tools.keys())) + + def has_tool(self, tool_name: str) -> bool: + return tool_name in self.tools + + def get_tool_description(self, + tool_name: str) -> Optional[ToolNamespaceConfig]: + if tool_name not in self.tools: + return None + if tool_name == "browser": + return ToolNamespaceConfig.browser() + elif tool_name == "python": + return ToolNamespaceConfig.python() + else: + raise ValueError(f"Unknown tool {tool_name}") + + @asynccontextmanager + async def new_session(self, tool_name: str): + yield self.tools[tool_name] From 5c7cc33f4dafd4949a3f4bda815fa980d71ba45f Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 6 Aug 2025 18:04:04 -0700 Subject: [PATCH 034/932] [gpt-oss] fix model config with hf_config (#22401) Signed-off-by: Yongye Zhu --- vllm/model_executor/models/gpt_oss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 896560fa24..c37c4e9610 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -61,9 +61,9 @@ class OAIAttention(nn.Module): "original_max_position_embeddings": config.rope_scaling["original_max_position_embeddings"], "beta_fast": - config.rope_ntk_beta, + config.rope_scaling["beta_fast"], "beta_slow": - config.rope_ntk_alpha, + config.rope_scaling["beta_slow"], }, is_neox_style=True, ) @@ -154,7 +154,7 @@ class MLPBlock(torch.nn.Module): dtype=torch.bfloat16) assert config.intermediate_size % self.world_size == 0 self.experts = FusedMoE(num_experts=config.num_local_experts, - top_k=config.num_experts_per_token, + top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, reduce_results=True, From 9a3835aaa9006c0d53628f278319642774d88fbe Mon Sep 17 00:00:00 2001 From: Lain Date: Wed, 6 Aug 2025 18:07:41 -0700 Subject: [PATCH 035/932] Fix trtllm-gen attention env and add attention sink (#22378) Signed-off-by: Siyuan Fu Signed-off-by: Lain Signed-off-by: Yongye Zhu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Yongye Zhu --- vllm/envs.py | 13 ++++--------- vllm/model_executor/models/gpt_oss.py | 5 ++--- vllm/utils/flashinfer.py | 8 ++++---- vllm/v1/attention/backends/flashinfer.py | 17 +++++++++-------- vllm/v1/attention/backends/utils.py | 6 ++---- 5 files changed, 21 insertions(+), 28 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8a3eb8e509..d9ebf59c1a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -152,8 +152,7 @@ if TYPE_CHECKING: VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False - VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False - VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False + VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False @@ -1043,13 +1042,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 
1, use the TRTLLM Context Attention backend in flashinfer. - "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_CONTEXT_ATTENTION", "0"))), - - # If set to 1, use the TRTLLM Decode Attention backend in flashinfer. - "VLLM_USE_TRTLLM_DECODE_ATTENTION": - lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", "0"))), + # If set to 1, use the TRTLLM attention backend in flashinfer. + "VLLM_USE_TRTLLM_ATTENTION": + lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index c37c4e9610..feb323a045 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -70,9 +70,8 @@ class OAIAttention(nn.Module): tp_size = get_tensor_model_parallel_world_size() - attention_sink_dtype = ( - torch.float32 if envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION else torch.bfloat16) + attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION + else torch.bfloat16) self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, dtype=attention_sink_dtype, diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index cce1aefaf9..32c52612ca 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -159,7 +159,7 @@ def use_trtllm_attention( # Check if the dimensions are supported by TRTLLM decode attention if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + or num_qo_heads % num_kv_heads != 0): return False env_value = envs.VLLM_USE_TRTLLM_ATTENTION @@ -169,10 +169,10 @@ def use_trtllm_attention( # Making the conditional check for zero because # the path is automatically enabled if the batch size condition # is satisfied. 
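# Illustrative sketch, not from this patch: the env var is now read as a raw
# string (see the envs.py hunk above), which yields three states instead of
# a boolean; `auto_detect` is a hypothetical stand-in for the size-based
# heuristic used below when the variable is unset.
def _resolve_trtllm_attention(env_value, auto_detect):
    if env_value is None:         # VLLM_USE_TRTLLM_ATTENTION unset
        return auto_detect()      # fall back to the heuristic
    return env_value == "1"       # "1" forces it on, anything else off


assert _resolve_trtllm_attention("1", lambda: False) is True
assert _resolve_trtllm_attention("0", lambda: True) is False
assert _resolve_trtllm_attention(None, lambda: True) is True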
- no_use_trtllm = (env_value == "0") - if not no_use_trtllm: + use_trtllm = (env_value == "1") + if use_trtllm: logger.info_once("Using TRTLLM attention.") - return not no_use_trtllm + return use_trtllm else: # Environment variable not set - use auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 061bd5f1d2..1fcb190286 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -215,6 +215,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers + # TODO: discard this for trtllm-gen backend self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) @@ -523,16 +524,12 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): head_dim = self.kv_cache_spec.head_size # currently prefill trtllm attention does not support fp8 kv cache - # trtllm may not support sliding window - prefill_use_trtllm = (self.global_hyperparameters.window_left == -1 - and not cache_dtype.startswith("fp8") - and use_trtllm_attention( + prefill_use_trtllm = use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim)) - decode_use_trtllm = (self.global_hyperparameters.window_left == -1 - and use_trtllm_attention( + num_qo_heads, num_kv_heads, head_dim) + decode_use_trtllm = use_trtllm_attention( num_decode_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim)) + num_qo_heads, num_kv_heads, head_dim) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -793,6 +790,8 @@ class FlashInferImpl(AttentionImpl): batch_size=attn_metadata.num_prefills, cum_seq_lens_q=attn_metadata.qo_indptr_gpu, cum_seq_lens_kv=attn_metadata.paged_kv_indptr_gpu, + window_left=window_left, + sinks=self.sinks, out=output[num_decode_tokens:], ) @@ -839,6 +838,8 @@ class FlashInferImpl(AttentionImpl): max_seq_len=attn_metadata.max_seq_len, bmm1_scale=layer._k_scale_float * self.scale, bmm2_scale=layer._v_scale_float, + window_left=window_left, + sinks=self.sinks, out=output[:num_decode_tokens], ) return output_padded diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index f521d94331..770c14572f 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -254,8 +254,7 @@ def get_kv_cache_layout(): # Override with format specified by the user. 
cache_layout = envs.VLLM_KV_CACHE_LAYOUT if cache_layout is None: - if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + if envs.VLLM_USE_TRTLLM_ATTENTION: cache_layout = "HND" else: cache_layout = get_kv_connector_cache_layout() @@ -333,8 +332,7 @@ def infer_global_hyperparameters( global_params = param_sets[0] # trtllm attention doesn't need global hyper params so disable the check - if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION - and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION): + if not envs.VLLM_USE_TRTLLM_ATTENTION: for params in param_sets: if params.window_left != global_params.window_left: raise ValueError( From e8961e963a76feb3e2c080220e79d2d5a9d272f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Aug 2025 21:10:24 -0400 Subject: [PATCH 036/932] Update `flashinfer-python==0.2.10` (#22389) Signed-off-by: mgoin --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d444087a3e..04a63f5d68 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -392,7 +392,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.9" +ARG FLASHINFER_GIT_REF="v0.2.10" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ diff --git a/setup.py b/setup.py index c6f4985c59..e374fcb816 100644 --- a/setup.py +++ b/setup.py @@ -665,7 +665,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.9"], + "flashinfer": ["flashinfer-python==0.2.10"], }, cmdclass=cmdclass, package_data=package_data, From 41b67f4263e6ee06cfb5e74073970e2cee854d5e Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:35:46 +0800 Subject: [PATCH 037/932] [model] Support MiniCPM-V 4.0 (#22166) Co-authored-by: imning3 --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/minicpmv.py | 148 +++++++++++++++++++++++-- 3 files changed, 140 insertions(+), 12 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 3816412268..265643a441 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -622,7 +622,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. 
| ✅︎ | | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index 69961d7385..2c2d094e04 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -427,7 +427,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", - extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 + extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 trust_remote_code=True, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index e172758b2f..3aa16bb9ab 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -38,6 +38,8 @@ from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -339,7 +341,9 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: mm_limits = {"image": None} - if self.get_model_version() == (2, 6): + if self.get_model_version() == (2, + 6) or self.get_model_version() == (4, + 0): mm_limits["video"] = None return mm_limits @@ -620,7 +624,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): out_keys: set[str], ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together - if self.info.get_model_version() == (2, 6): + if self.info.get_model_version() == ( + 2, 6) or self.info.get_model_version() == (4, 0): inputs = super()._call_hf_processor( prompt=prompts, # type: ignore mm_data=mm_data, @@ -679,10 +684,18 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - placeholder = { - "image": self.info.image_pattern, - "video": self.info.video_pattern, - } + placeholders = [("image", self.info.image_pattern), + ("video", self.info.video_pattern)] + + # hard code for inconsistency of encode-decode image_pattern + additional_placeholders = [] + tokenizer = self.info.get_tokenizer() + for modality, pattern in placeholders: + sub_pattern = tokenizer.decode( + tokenizer.encode(pattern, add_special_tokens=False)) + if sub_pattern != pattern: + additional_placeholders.append((modality, sub_pattern)) + placeholders += additional_placeholders def get_image_replacement(item_idx: int): images = mm_items.get_items( @@ -714,9 
+727,9 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): return [ PromptReplacement(modality=modality, - target=placeholder[modality], + target=pattern, replacement=get_replacement[modality]) - for modality in ("image", "video") + for modality, pattern in placeholders ] def _get_mm_fields_config( @@ -1262,11 +1275,124 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): return self.resampler(vision_embedding, tgt_sizes) + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + + +class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + assert self.version == (4, 0) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)): + return None + return quant_config + + def init_llm( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> nn.Module: + return LlamaForCausalLM(vllm_config=vllm_config, prefix=prefix) + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + with set_default_torch_dtype(torch.float16): + # The resampler in 4.0 remains consistent with the one in 2.5/2.6. 
+ resampler = Resampler2_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) + + return resampler.to(device=current_platform.device_type, + dtype=torch.get_default_dtype()) + + def get_vision_hidden_states( + self, data: MiniCPMVImagePixelInputs) -> torch.Tensor: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + + B = len(pixel_values) + P = pixel_values[0].shape[-2] + L = max(item.shape[-1] for item in pixel_values) + device = pixel_values[0].device + dtype = pixel_values[0].dtype + + all_pixel_values = torch.zeros((B, 3, P, L), + dtype=dtype, + device=device) + for i, pixel_values_item in enumerate(pixel_values): + L_item = pixel_values_item.shape[-1] + all_pixel_values[i, ..., :L_item] = pixel_values_item + + num_patches = tgt_sizes.prod(-1) + max_patches = num_patches.max().item() + assert isinstance(max_patches, int) + + patch_attn_mask = torch.zeros((B, max_patches), + dtype=torch.bool, + device=device) + for i, num_patches_item in enumerate(num_patches): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( + all_pixel_values, + patch_attention_mask=patch_attn_mask.unsqueeze(1), + tgt_sizes=tgt_sizes, + ) + + return self.resampler(vision_embedding, tgt_sizes) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, (2, 6): MiniCPMV2_6, + (4, 0): MiniCPMV4_0, } @@ -1294,8 +1420,10 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): # Dispatch class based on version instance_cls = _SUPPORT_VERSION.get(version) if instance_cls is None: - raise ValueError( - "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") + supported_versions = ", ".join( + [f"{v[0]}.{v[1]}" for v in sorted(_SUPPORT_VERSION.keys())]) + raise ValueError(f"Currently, MiniCPMV only supports versions " + f"{supported_versions}. Got version: {version}") # quant_config references base class members, # so update values before init is called From f825c6bd22133a8b2242457069f59654a2ae401b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 6 Aug 2025 22:37:14 -0300 Subject: [PATCH 038/932] Support encoder_only attention for FlexAttention (#22273) Signed-off-by: Max de Bayser --- tests/kernels/test_flex_attention.py | 88 +++++++++++++----- vllm/v1/attention/backends/flex_attention.py | 95 ++++++++++++++------ 2 files changed, 137 insertions(+), 46 deletions(-) diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index e25556c89f..f76bd19246 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -9,7 +9,9 @@ import pytest import torch from packaging import version -from vllm import LLM, SamplingParams +from vllm import SamplingParams + +from ..models.utils import check_embeddings_close TORCH_VERSION = version.parse(torch.__version__) MINIMUM_TORCH_VERSION = version.parse("2.7.0") @@ -28,7 +30,7 @@ def set_seed(seed): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_flex_attention_vs_default_backend(monkeypatch): +def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -36,7 +38,7 @@ def test_flex_attention_vs_default_backend(monkeypatch): """ model_name = "Qwen/Qwen2.5-1.5B-Instruct" seed = 42 - max_tokens = 32 + max_tokens = 24 prompts = [ "Hello, my name is", "The president of the United States is", @@ -54,33 +56,30 @@ def test_flex_attention_vs_default_backend(monkeypatch): m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") set_seed(seed) - - llm_flex = LLM( - model_name, - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) - output_flex = llm_flex.generate(prompts, sampling_params) + with vllm_runner(model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True) as llm_flex: + output_flex = llm_flex.generate(prompts, sampling_params) # Run with default backend with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") set_seed(seed) - llm_default = LLM( - model_name, - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) - output_default = llm_default.generate(prompts, sampling_params) + with vllm_runner(model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True) as llm_default: + output_default = llm_default.generate(prompts, sampling_params) # Compare outputs from both backends for i, (flex_result, default_result) in enumerate(zip(output_flex, output_default)): prompt = prompts[i] - flex_text = flex_result.outputs[0].text - default_text = default_result.outputs[0].text + flex_text = flex_result[1][0] + default_text = default_result[1][0] assert flex_text == default_text, ( f"FlexAttention output doesn't match default for: {prompt!r}\n" @@ -88,5 +87,54 @@ def test_flex_attention_vs_default_backend(monkeypatch): f"Default: {default_text!r}") +@pytest.mark.skipif( + not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, + reason="CUDA not available or PyTorch version < 2.7", +) +def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): + """Test that FlexAttention produces the same outputs as the default backend. + + This test compares the outputs from the FlexAttention backend with + the default backend for encoder models. 
+ """ + model_name = "BAAI/bge-base-en-v1.5" + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + ] + + # Run with flex attention + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + with vllm_runner(model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True) as llm_flex: + flex_outputs = llm_flex.embed(prompts) + + # Run with default backend + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + with vllm_runner(model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True) as llm_default: + default_outputs = llm_default.embed(prompts) + + check_embeddings_close( + embeddings_0_lst=flex_outputs, + embeddings_1_lst=default_outputs, + name_0="flex", + name_1="default", + tol=1e-2, + ) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index bb0d890c77..e599411b2d 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -148,6 +148,7 @@ def causal_mask_mod(b: torch.Tensor, h: torch.Tensor, q_idx: torch.Tensor, @dataclass class FlexAttentionMetadata: + causal: bool num_actual_tokens: int # Number of tokens excluding padding. max_query_len: int query_start_loc: torch.Tensor @@ -177,10 +178,9 @@ class FlexAttentionMetadata: num_blocks = 0 block_mask: Optional[BlockMask] = None score_mod: Optional[_score_mod_signature] = None - mask_mod: Optional[_mask_mod_signature] = None logical_mask_mod: _mask_mod_signature = causal_mask_mod - def get_mask_mod(self) -> _mask_mod_signature: + def get_causal_mask_mod(self) -> _mask_mod_signature: """Creates the mask_mod function for FlexAttention. This function creates the combined mask mod function that handles: @@ -233,14 +233,39 @@ class FlexAttentionMetadata: return final_mask_mod + def get_bidirectional_mask_mod(self) -> _mask_mod_signature: + """Creates the encoder mask_mod function for FlexAttention. + + Since the encoder bidirectional attention doesn't run with + KV cache, this function creates a mask based on the + packed query sequences. + """ + # Create a lookup mapping from query indices -> request number + request_lookup = _offsets_to_doc_ids_tensor(self.query_start_loc) + + def final_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + ) -> torch.Tensor: + return request_lookup[q_idx] == request_lookup[kv_idx] + + return final_mask_mod + def build_block_mask(self) -> BlockMask: - assert self.mask_mod is not None + if self.causal: + mask_mod = self.get_causal_mask_mod() + kv_len = self.total_cache_tokens + else: + mask_mod = self.get_bidirectional_mask_mod() + kv_len = self.num_actual_tokens return create_block_mask_compiled( - self.mask_mod, + mask_mod, None, None, self.num_actual_tokens, - self.total_cache_tokens, + kv_len, device=self.block_table.device, ) @@ -251,7 +276,6 @@ class FlexAttentionMetadata: assert self.prefix_kv_lens is None, "Not implemented yet." assert self.suffix_kv_lens is None, "Not implemented yet." 
self.num_blocks = self.total_cache_tokens // self.block_size - self.mask_mod = self.get_mask_mod() self.block_mask = self.build_block_mask() @@ -306,6 +330,7 @@ class FlexAttentionMetadataBuilder( self.device, non_blocking=True) out = FlexAttentionMetadata( + causal=common_attn_metadata.causal, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, @@ -350,6 +375,12 @@ class FlexAttentionImpl(AttentionImpl): self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_kv_heads + self.attn_type = attn_type + + if attn_type not in (AttentionType.ENCODER_ONLY, + AttentionType.DECODER): + raise NotImplementedError( + f"FlexAttention does not support {attn_type} attention") if alibi_slopes is not None: raise NotImplementedError( @@ -425,26 +456,38 @@ class FlexAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens - key_cache, value_cache = kv_cache.unbind(0) + if not attn_metadata.causal: + assert self.attn_type == AttentionType.ENCODER_ONLY - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + query, key_tensor, value_tensor = map( + lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), + (query, key, value), + ) + + else: + assert self.attn_type == AttentionType.DECODER + key_cache, value_cache = kv_cache.unbind(0) + + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + # View out the block_size dim + key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size) + value_cache = value_cache.view(-1, self.num_kv_heads, + self.head_size) + query, key_tensor, value_tensor = map( + lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), + (query, key_cache, value_cache), + ) - # View out the block_size dim - key_cache = key_cache.view(-1, self.num_kv_heads, self.head_size) - value_cache = value_cache.view(-1, self.num_kv_heads, self.head_size) - query, key_cache, value_cache = map( - lambda x: self.view_as_4d(x).permute(0, 2, 1, 3), - (query, key_cache, value_cache), - ) query = query[:, :, :num_actual_tokens, :] # Doesn't work for now -> constraint violation # torch._dynamo.try_mark_dynamic(query, 2) @@ -465,8 +508,8 @@ class FlexAttentionImpl(AttentionImpl): out = flex_attention_compiled( query, - key_cache, - value_cache, + key_tensor, + value_tensor, attn_metadata.score_mod, attn_metadata.block_mask, self.scale, From 1dc8a70b6d4e8ba4e139f1ddb86a166694f42f21 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 6 Aug 2025 21:40:52 -0400 Subject: [PATCH 039/932] [Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588) Signed-off-by: Lucas Wilkinson --- tests/v1/spec_decode/test_eagle.py | 3 +- tests/v1/worker/test_gpu_model_runner.py | 6 +- vllm/attention/backends/abstract.py | 4 + vllm/attention/layer.py | 36 +- .../layers/chunked_local_attention.py | 88 +++++ vllm/attention/selector.py | 2 +- vllm/model_executor/models/llama4.py | 10 +- vllm/v1/attention/backends/utils.py | 48 ++- vllm/v1/spec_decode/eagle.py | 9 +- vllm/v1/worker/cpu_model_runner.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 342 +++++++++--------- vllm/v1/worker/tpu_model_runner.py | 5 +- vllm/v1/worker/utils.py | 21 ++ 13 files changed, 369 insertions(+), 213 deletions(-) create mode 100644 
vllm/attention/layers/chunked_local_attention.py diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 05f6dd40a9..73b47f8974 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -313,7 +313,8 @@ def test_propose(num_speculative_tokens, backend): # Mock runner for attention metadata building proposer.runner = mock.MagicMock() - proposer.runner.attn_metadata_builders = [attn_metadata_builder] + proposer.runner.attn_groups.append([mock.MagicMock()]) + proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder result = proposer.propose(target_token_ids=target_token_ids, target_positions=target_positions, diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 231dfcbb68..e151d388c2 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -417,12 +417,12 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): return rnd_stride # Patch the attention backend class and re-trigger the KV cache creation. - for attn_backend in model_runner.attn_backends: + for attn_group in model_runner._attn_group_iterator(): + attn_backend = attn_group.backend monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", rnd_stride_order) - model_runner.attn_backends = [] - model_runner.attn_metadata_builders = [] + model_runner.attn_groups = [] model_runner.initialize_kv_cache(model_runner.kv_cache_config) # Shape is unchanged, but layout may differ diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index ba20da4fd7..2417fe06a6 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -106,6 +106,10 @@ class AttentionBackend(ABC): block_size: int, num_seqs: int, num_queries: int) -> None: raise NotImplementedError + @classmethod + def full_cls_name(cls) -> tuple[str, str]: + return (cls.__module__, cls.__qualname__) + @dataclass class AttentionMetadata: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 178453ecdc..b4c3cbd7c9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -9,6 +9,7 @@ import torch.nn.functional as F import vllm.envs as envs from vllm.attention import AttentionType +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.config import CacheConfig, get_current_vllm_config @@ -80,6 +81,7 @@ class Attention(nn.Module): prefix: str = "", attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, + attn_backend: Optional[type[AttentionBackend]] = None, **extra_impl_args, ) -> None: """ @@ -137,15 +139,6 @@ class Attention(nn.Module): self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window - # For v1 we have backend agnostic iRoPE (local chunked attention) - # we have to store the flag on the layer so gpu model runner can - # set KVSpec appropriately (and pop it so it doesnt get passed to - # the backends) - if envs.VLLM_USE_V1: - self.use_irope = extra_impl_args.pop("use_irope", False) - else: - self.use_irope = extra_impl_args.get("use_irope", False) - quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None and not isinstance( @@ -166,18 +159,22 @@ class Attention(nn.Module): # During model initialization, the default dtype is 
set as the model # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(head_size, - dtype, - kv_cache_dtype, - block_size, - is_attention_free, - use_mla=use_mla) - impl_cls = attn_backend.get_impl_cls() + if attn_backend is None: + self.attn_backend = get_attn_backend(head_size, + dtype, + kv_cache_dtype, + block_size, + is_attention_free, + use_mla=use_mla) + else: + self.attn_backend = attn_backend + + impl_cls = self.attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, logits_soft_cap, attn_type, kv_sharing_target_layer_name, **extra_impl_args) - self.backend = backend_name_to_enum(attn_backend.get_name()) + self.backend = backend_name_to_enum(self.attn_backend.get_name()) self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how @@ -187,7 +184,7 @@ class Attention(nn.Module): self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() - self.use_output = attn_backend.accept_output_buffer + self.use_output = self.attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -309,6 +306,9 @@ class Attention(nn.Module): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + def get_attn_backend(self) -> type[AttentionBackend]: + return self.attn_backend + class MultiHeadAttention(nn.Module): """Multi-headed attention without any cache, used for ViT.""" diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py new file mode 100644 index 0000000000..892077ba91 --- /dev/null +++ b/vllm/attention/layers/chunked_local_attention.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from typing import List, Optional + +import torch + +from vllm import envs +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig, QuantizationConfig +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, make_local_attention_virtual_batches, + subclass_attention_backend, subclass_attention_metadata_builder) + +from ..layer import Attention + + +@functools.lru_cache +def create_chunked_local_attention_backend( + underlying_attn_backend: AttentionBackend, + attention_chunk_size: int, + block_size: int, +) -> type[AttentionBackend]: + prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" + + def build_preprocess_fn(cm: CommonAttentionMetadata): + return make_local_attention_virtual_batches(attention_chunk_size, cm, + block_size) + + # Dynamically create a new attention backend that wraps the + # underlying attention backend but applies + # `make_local_attention_virtual_batches` before calling `build(...)` + builder_cls = subclass_attention_metadata_builder( + name_prefix=prefix, + builder_cls=underlying_attn_backend.get_builder_cls(), + build_preprocess_fn=build_preprocess_fn) + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=builder_cls) + + return attn_backend + + +class ChunkedLocalAttention(Attention): + + def __init__(self, + num_heads: int, + head_size: int, + 
scale: float, + attention_chunk_size: int, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + kv_sharing_target_layer_name: Optional[str] = None, + prefix: str = ""): + dtype = torch.get_default_dtype() + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + if envs.VLLM_USE_V1: + underlying_attn_backend = get_attn_backend(head_size, dtype, + kv_cache_dtype, + block_size) + + attn_backend = create_chunked_local_attention_backend( + underlying_attn_backend, attention_chunk_size, block_size) + else: + # in v0 the local attention is handled inside the backends + attn_backend = None + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + kv_sharing_target_layer_name=kv_sharing_target_layer_name, + attn_backend=attn_backend) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 596c556e54..508470bb36 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -142,7 +142,7 @@ def get_attn_backend( dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, - is_attention_free: bool, + is_attention_free: bool = False, use_mla: bool = False, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 60098209c3..1f8b9d0744 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -25,6 +25,7 @@ from torch import nn from transformers import Llama4TextConfig from vllm.attention import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -194,17 +195,18 @@ class Llama4Attention(nn.Module): is_neox_style=is_neox_style, ) if not self.nope else None - self.attn = Attention( + attn_cls = Attention if self.nope else ChunkedLocalAttention + self.attn = attn_cls( self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - per_layer_sliding_window=None, - use_irope=not self.nope, prefix=f"{prefix}.attn", - ) + **({ + "attention_chunk_size": config.attention_chunk_size + } if not self.nope else {})) def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor: floor = torch.floor((positions + 1.0) / self.floor_scale) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 770c14572f..e23dd8bc5b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -5,12 +5,12 @@ import enum import functools from abc import abstractmethod from dataclasses import dataclass, make_dataclass -from typing import TYPE_CHECKING, Any, ClassVar, Generic, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Generic, Optional, + TypeVar) import numpy as np import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils import cdiv @@ -20,6 +20,8 @@ if 
TYPE_CHECKING: from vllm.v1.worker.gpu_input_batch import InputBatch import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.layer import Attention from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) from vllm.logger import init_logger @@ -532,6 +534,48 @@ def make_local_attention_virtual_batches( ) +def subclass_attention_metadata_builder( + name_prefix: str, + builder_cls: type[AttentionMetadataBuilder[M]], + build_preprocess_fn: Callable[[CommonAttentionMetadata], + CommonAttentionMetadata], +) -> type[AttentionMetadataBuilder[M]]: + """ + Return a new subclass of `builder_cls` whose .build(...) method + first calls build_preprocess_fn(common_attn_metadata) on the metadata. + """ + name: str = name_prefix + builder_cls.__name__ # type: ignore + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False): + return builder_cls.build(self, common_prefix_len, + build_preprocess_fn(common_attn_metadata), + fast_build) + + Wrapped = type( + name, + (builder_cls, ), # inherit from the original + { + "build": build, + }) + return Wrapped # type: ignore + + +def subclass_attention_backend( + name_prefix: str, attention_backend_cls: type[AttentionBackend], + builder_cls: type[AttentionMetadataBuilder[M]] +) -> type[AttentionBackend]: + """ + Return a new subclass where `get_builder_cls` returns `builder_cls`. + """ + name: str = name_prefix + attention_backend_cls.__name__ # type: ignore + + return type(name, (attention_backend_cls, ), + {"get_builder_cls": lambda: builder_cls}) + + def split_decodes_and_prefills( common_attn_metadata: CommonAttentionMetadata, decode_threshold: int = 1, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index b2380bb3dd..3c36971fe5 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -158,9 +158,9 @@ class EagleProposer: assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups - attn_metadata = self.runner.attn_metadata_builders[ - 0].build_for_drafting(common_attn_metadata=common_attn_metadata, - draft_index=0) + attn_metadata = self.runner.attn_groups[0][0].metadata_builder\ + .build_for_drafting(common_attn_metadata=common_attn_metadata, + draft_index=0) # At this moment, we assume all eagle layers belong to the same KV # cache group, thus using the same attention metadata. 
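# Self-contained toy, using hypothetical classes rather than vLLM's, of the
# dynamic-subclassing pattern introduced by subclass_attention_metadata_builder
# and subclass_attention_backend above: a generated wrapper's build() first
# preprocesses the metadata and then delegates to the original builder, which
# is how ChunkedLocalAttention reuses an unmodified underlying backend.
class ToyBuilder:

    def build(self, common_prefix_len, common_attn_metadata):
        return ("built", common_prefix_len, common_attn_metadata)


def wrap_builder(builder_cls, preprocess):

    def build(self, common_prefix_len, common_attn_metadata):
        return builder_cls.build(self, common_prefix_len,
                                 preprocess(common_attn_metadata))

    return type("Preprocessed" + builder_cls.__name__, (builder_cls, ),
                {"build": build})


LocalToyBuilder = wrap_builder(ToyBuilder, lambda meta: ("chunked", meta))
assert LocalToyBuilder().build(0, "meta") == ("built", 0, ("chunked", "meta"))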
@@ -349,7 +349,8 @@ class EagleProposer: hidden_states: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, ) -> list[torch.Tensor]: - tree_attn_metadata_builder = self.runner.attn_metadata_builders[0] + tree_attn_metadata_builder = \ + self.runner.attn_groups[0][0].metadata_builder assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index d8f3e0d89a..11b96d9463 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -53,11 +53,11 @@ class CPUModelRunner(GPUModelRunner): raise ValueError("Multiple KVCacheGroups is not" "currently supported with CPU model runner.") - assert type( - self.attn_metadata_builders[0]) is TorchSDPAMetadataBuilderV1 + assert type(self.attn_groups[0] + [0].metadata_builder) is TorchSDPAMetadataBuilderV1 - self.attn_metadata_builders[0].reorder_batch(self.input_batch, - scheduler_output) + self.attn_groups[0][0].metadata_builder.reorder_batch( + self.input_batch, scheduler_output) def _postprocess_tenosrs(self) -> None: # Note: replace device tensors with cpu tensors diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 549f21af79..08b253dcdb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,7 +3,10 @@ import dataclasses import gc +import itertools import time +from collections import defaultdict +from collections.abc import Iterator from contextlib import contextmanager from typing import TYPE_CHECKING, Any, Optional, Union, cast @@ -14,9 +17,9 @@ import torch.nn as nn from tqdm import tqdm import vllm.envs as envs -from vllm.attention import AttentionType, get_attn_backend +from vllm.attention import Attention, AttentionType from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.layer import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.counter import compilation_counter from vllm.config import (CompilationLevel, VllmConfig, get_layers_from_vllm_config, update_config) @@ -50,7 +53,6 @@ from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, - make_local_attention_virtual_batches, reorder_batch_to_split_decodes_and_prefills) from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, @@ -73,8 +75,8 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from ..sample.logits_processor import LogitsProcessorManager -from .utils import (MultiModalBudget, bind_kv_cache, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, +from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: @@ -162,8 +164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] - self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] - self.attn_backends: list[type[AttentionBackend]] = [] + # indexes: [kv_cache_group_id][attn_group] + self.attn_groups: list[list[AttentionGroup]] = 
[] # self.kv_cache_config: KVCacheConfig # req_id -> (input_id -> encoder_output) @@ -830,81 +832,51 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): spec_decode_common_attn_metadata is None: spec_decode_common_attn_metadata = common_attn_metadata - if isinstance(kv_cache_group_spec.kv_cache_spec, - ChunkedLocalAttentionSpec): - common_attn_metadata = make_local_attention_virtual_batches( - kv_cache_group_spec.kv_cache_spec.attention_chunk_size, - common_attn_metadata, self.cache_config.block_size) + for attn_group in self.attn_groups[kv_cache_group_id]: + # Prepare for cascade attention if enabled & beneficial. + common_prefix_len = 0 + builder = attn_group.metadata_builder + if self.cascade_attn_enabled: + common_prefix_len = self._compute_cascade_attn_prefix_len( + num_scheduled_tokens, + scheduler_output. + num_common_prefix_blocks[kv_cache_group_id], + kv_cache_group_spec.kv_cache_spec, + builder, + ) - # Prepare for cascade attention if enabled & beneficial. - common_prefix_len = 0 - builder = self.attn_metadata_builders[kv_cache_group_id] - if self.cascade_attn_enabled: - common_prefix_len = self._compute_cascade_attn_prefix_len( - num_scheduled_tokens, - scheduler_output. - num_common_prefix_blocks[kv_cache_group_id], - kv_cache_group_spec.kv_cache_spec, - builder, - ) - - attn_metadata_i = (builder.build( - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata, - )) - - fast_prefill_metadata = attn_metadata_i - if (self.cache_config.kv_sharing_fast_prefill - and self.kv_sharing_fast_prefill_eligible_layers): - # Dynamically create a a dataclass type that inherits - # from attention metadata type but includes additional - # fields logits_indices_padded and num_logits_indices - # which are required for prefill truncation - fast_prefill_metadata_type = ( - make_kv_sharing_fast_prefill_attention_metadata( - metadata_cls=type(attn_metadata_i), )) - fast_prefill_metadata = fast_prefill_metadata_type( - **dataclasses.asdict(attn_metadata_i), - logits_indices_padded=logits_indices_padded, - num_logits_indices=logits_indices.size(0), - ) - - for layer_name in kv_cache_group_spec.layer_names: - if (self.cache_config.kv_sharing_fast_prefill and layer_name - in self.kv_sharing_fast_prefill_eligible_layers): - attn_metadata[layer_name] = fast_prefill_metadata - continue - - attn_metadata[layer_name] = attn_metadata_i - - # Hack for now to fix chunked local attention + no hybrid kv cache - # manager we can remove this once - # https://github.com/vllm-project/vllm/pull/21588 - # is merged (i.e. 
properly handle different attention backends for - # the same kv_cache_spec) - if self.attention_chunk_size is not None \ - and self.scheduler_config.disable_hybrid_kv_cache_manager: - if not hasattr(self, "local_attention_layers"): - self.local_attention_layers = [] - attn_layers = get_layers_from_vllm_config( - self.vllm_config, Attention) - for layer_name, attn_module in attn_layers.items(): - if attn_module.use_irope: - self.local_attention_layers.append(layer_name) - - local_attn_metadata_i = (builder.build( - common_prefix_len=0, - common_attn_metadata=make_local_attention_virtual_batches( - self.attention_chunk_size, common_attn_metadata, - self.cache_config.block_size), + attn_metadata_i = (builder.build( + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata, )) - for layer_name in self.local_attention_layers: - attn_metadata[layer_name] = local_attn_metadata_i + fast_prefill_metadata = attn_metadata_i + if (self.cache_config.kv_sharing_fast_prefill + and self.kv_sharing_fast_prefill_eligible_layers): + # Dynamically create a a dataclass type that inherits + # from attention metadata type but includes additional + # fields logits_indices_padded and num_logits_indices + # which are required for prefill truncation + fast_prefill_metadata_type = ( + make_kv_sharing_fast_prefill_attention_metadata( + metadata_cls=type(attn_metadata_i), )) + fast_prefill_metadata = fast_prefill_metadata_type( + **dataclasses.asdict(attn_metadata_i), + logits_indices_padded=logits_indices_padded, + num_logits_indices=logits_indices.size(0), + ) + + for layer_name in attn_group.layer_names: + if (self.cache_config.kv_sharing_fast_prefill + and layer_name + in self.kv_sharing_fast_prefill_eligible_layers): + attn_metadata[layer_name] = fast_prefill_metadata + continue + attn_metadata[layer_name] = attn_metadata_i attention_cuda_graphs = all( - b.can_run_in_cudagraph(common_attn_metadata) - for b in self.attn_metadata_builders) + g.metadata_builder.can_run_in_cudagraph(common_attn_metadata) + for g in self._attn_group_iterator()) # Hot-Swap lora model if self.lora_config: @@ -2229,11 +2201,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): block_table[kv_cache_group_id].slot_mapping[:num_tokens], causal=True) - attn_metadata_i = self.attn_metadata_builders[ - kv_cache_group_id].build_for_cudagraph_capture( - common_attn_metadata) - for layer_name in kv_cache_group_spec.layer_names: - attn_metadata[layer_name] = attn_metadata_i + for attn_group in self.attn_groups[kv_cache_group_id]: + attn_metadata_i = attn_group.metadata_builder\ + .build_for_cudagraph_capture(common_attn_metadata) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -2565,88 +2537,100 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def _initialize_single_attn_backend( - self, kv_cache_spec: KVCacheSpec, layer_names: list[str] - ) -> tuple[AttentionBackend, AttentionMetadataBuilder]: - if isinstance(kv_cache_spec, AttentionSpec): - attn_backend_i = get_attn_backend( - kv_cache_spec.head_size, - self.dtype, - kv_cache_spec.dtype, - kv_cache_spec.block_size, - self.model_config.is_attention_free, - use_mla=kv_cache_spec.use_mla, - ) - if attn_backend_i is None: - error_msg = (f"Error with get_attn_backend: " - 
f"{kv_cache_spec.head_size=}, " - f"{self.dtype=}, {kv_cache_spec.dtype=}, " - f"{kv_cache_spec.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{kv_cache_spec.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 " - "GPUModelRunner.") - elif isinstance(kv_cache_spec, MambaSpec): - attn_backend_i = get_mamba_attn_backend(kv_cache_spec.mamba_type) - else: - raise ValueError( - f"Unknown KV cache spec type: {type(kv_cache_spec)}") - - attn_metadata_builder_i = attn_backend_i.get_builder_cls()( - kv_cache_spec, - layer_names, - self.vllm_config, - self.device, - ) - - if self.full_cuda_graph: - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.NEVER: - raise ValueError(f"Full CUDAGraph not supported for " - f"{attn_backend_i.__name__}. Turn off " - f"CompilationConfig.full_cuda_graph or use a " - f" different attention backend.") - if attn_metadata_builder_i.attn_cudagraph_support == \ - AttentionCGSupport.PURE_DECODE_ONLY: - # Limit the max cudagraph size to the max number of - # sequences for pure decode only cudagraph backend, - # whose max_query_len is 1. - self.cudagraph_batch_sizes = [ - size for size in self.cudagraph_batch_sizes - if size <= self.scheduler_config.max_num_seqs - ] - return attn_backend_i, attn_metadata_builder_i - def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize the attention backends and attention metadata builders. """ - assert len(self.attn_backends) == 0 and len( - self.attn_metadata_builders - ) == 0, "Attention backends are already initialized" - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec + assert len(self.attn_groups) == 0, \ + "Attention backends are already initialized" + attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - attn_backend_i, attn_metadata_builder_i = ( - self._initialize_single_attn_backend( - kv_cache_spec, kv_cache_group_spec.layer_names)) - self.attn_backends.append(attn_backend_i) - self.attn_metadata_builders.append(attn_metadata_builder_i) + def get_attn_backends_for_layers( + layer_names: list[str] + ) -> dict[type[AttentionBackend], list[str]]: + attn_backends = {} + attn_backend_layers = defaultdict(list) + # Dedupe based on full class name; this is a bit safer than using + # using the class itself as the key because when we create dynamic + # attention backend subclasses (e.g. ChunkedLocalAttention) unless + # they are cached correctly, there will be different objects per + # layer. 
+ for layer_name in layer_names: + attn_backend = attn_layers[layer_name].get_attn_backend() + key = attn_backend.full_cls_name() + attn_backends[key] = attn_backend + attn_backend_layers[key].append(layer_name) + return { + attn_backends[k]: v + for k, v in attn_backend_layers.items() + } + + def create_attn_groups( + attn_backends_map: dict[AttentionBackend, list[str]], + kv_cache_spec: KVCacheSpec, + ) -> list[AttentionGroup]: + attn_groups: list[AttentionGroup] = [] + for attn_backend, layer_names in attn_backends_map.items(): + attn_metadata_builder_i = attn_backend.get_builder_cls()( + kv_cache_spec, + layer_names, + self.vllm_config, + self.device, + ) + attn_group = AttentionGroup(attn_backend, + attn_metadata_builder_i, + layer_names) + attn_groups.append(attn_group) + + if self.full_cuda_graph: + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.NEVER: + raise ValueError( + f"Full CUDAGraph not supported for " + f"{attn_backend.__name__}. Turn off " + f"CompilationConfig.full_cuda_graph or use a " + f" different attention backend.") + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.PURE_DECODE_ONLY: + # Limit the max cudagraph size to the max number of + # sequences for pure decode only cudagraph backend, + # whose max_query_len is 1. + self.cudagraph_batch_sizes = [ + size for size in self.cudagraph_batch_sizes + if size <= self.scheduler_config.max_num_seqs + ] + + return attn_groups + + for kv_cache_group_spec in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(kv_cache_spec, AttentionSpec): + attn_backends = get_attn_backends_for_layers( + kv_cache_group_spec.layer_names) + # TODO(lucas): move `get_mamba_attn_backend` into the mamba + # layers like above + elif isinstance(kv_cache_spec, MambaSpec): + attn_backends = { + get_mamba_attn_backend(kv_cache_spec.mamba_type): + kv_cache_group_spec.layer_names + } + else: + raise ValueError( + f"Unknown KV cache spec type: {type(kv_cache_spec)}") + + self.attn_groups.append( + create_attn_groups(attn_backends, kv_cache_spec)) # Calculate reorder batch threshold (if neeeded) self.calculate_reorder_batch_threshold() - if len(self.attn_backends) > 0: + if len(self.attn_groups) > 0: return # Check if model is encoder-only block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla attn_specs = list[AttentionSpec]() - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) for attn_module in attn_layers.values(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: @@ -2666,11 +2650,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(attn_specs) == len(attn_layers), \ "All or none of the layers are expected to be encoder-only" - attn_backend, attn_metadata_builder = ( - self._initialize_single_attn_backend(attn_specs[0], - attn_layers.keys())) - self.attn_backends.append(attn_backend) - self.attn_metadata_builders.append(attn_metadata_builder) + attn_backends = get_attn_backends_for_layers(attn_layers.keys()) + + self.attn_groups.append( + create_attn_groups(attn_backends, attn_specs[0])) self.is_encoder_only_model = True def calculate_reorder_batch_threshold(self) -> None: @@ -2678,7 +2661,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Check that if any backends reorder batches; that the reordering is compatible (e.g., decode threshold is the same) """ - for attn_metadata_builder_i in 
self.attn_metadata_builders: + for group in self._attn_group_iterator(): + attn_metadata_builder_i = group.metadata_builder + # check that if any backends reorder batches; that the reordering # is compatible (e.g., decode threshold is the same) reorder_batch_threshold_i = ( @@ -2752,6 +2737,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )), "Some layers are not correctly initialized" return kv_cache_raw_tensors + def _attn_group_iterator(self) -> Iterator[AttentionGroup]: + return itertools.chain.from_iterable(self.attn_groups) + + def _kv_cache_spec_attn_group_iterator( + self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + if not self.kv_cache_config.kv_cache_groups: + return + for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): + for attn_group in attn_groups: + yield self.kv_cache_config.kv_cache_groups[ + kv_cache_spec_id].kv_cache_spec, attn_group + def _reshape_kv_cache_tensors( self, kv_cache_config: KVCacheConfig, @@ -2770,23 +2767,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - for layer_name in kv_cache_group_spec.layer_names: + for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + attn_backend = group.backend + for layer_name in group.layer_names: raw_tensor = kv_cache_raw_tensors[layer_name] assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0 num_blocks = (raw_tensor.numel() // kv_cache_spec.page_size_bytes) if isinstance(kv_cache_spec, AttentionSpec): has_attn = True - kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( + kv_cache_shape = attn_backend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype try: - kv_cache_stride_order = self.attn_backends[ - i].get_kv_cache_stride_order() + kv_cache_stride_order = \ + attn_backend.get_kv_cache_stride_order() assert len(kv_cache_stride_order) == len( kv_cache_shape) except (AttributeError, NotImplementedError): @@ -2850,15 +2846,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_raw_tensors: The KV cache buffer of each layer. 
""" - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - for layer_name in kv_cache_group_spec.layer_names: + for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for layer_name in group.layer_names: raw_tensor = kv_cache_raw_tensors[layer_name] num_blocks = (raw_tensor.numel() // kv_cache_spec.page_size_bytes) if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( + + kv_cache_shape = group.backend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) if kv_cache_shape[0] != num_blocks or kv_cache_shape[ @@ -2893,6 +2888,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.shared_kv_cache_layers, kv_cache_config.kv_cache_groups, kv_caches, + self.attn_groups, ) attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) @@ -2958,9 +2954,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): continue # TODO: Support other attention modules, e.g., cross-attention + # TODO(lucas): move the attention specs into the model layers like + # the attention backends if attn_module.attn_type == AttentionType.DECODER: - use_local_attention = (self.attention_chunk_size is not None - and attn_module.use_irope) if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -2969,10 +2965,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, use_mla=use_mla) - assert not use_local_attention, ( - "attention module can not be with ", - "both local attention and sliding window") - elif use_local_attention: + elif self.attention_chunk_size is not None \ + and isinstance(attn_module, ChunkedLocalAttention): kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, @@ -3043,7 +3037,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Use the first attention metadata builder # to create encoder attention metadata - builder = self.attn_metadata_builders[0] + builder = self.attn_groups[0][0].metadata_builder dummy_block_table = torch.zeros((num_reqs, 1), dtype=torch.int32, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5f3188efdb..81252f9b60 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -15,8 +15,9 @@ import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr import vllm.envs as envs +from vllm.attention import Attention from vllm.attention.backends.abstract import AttentionType -from vllm.attention.layer import Attention +from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import (ParallelConfig, VllmConfig, get_layers_from_vllm_config, update_config) @@ -518,7 +519,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): continue if attn_module.attn_type == AttentionType.DECODER: - if attn_module.use_irope: + if isinstance(attn_module, ChunkedLocalAttention): logger.warning_once( "Using irope in Pallas is not supported yet, it " "will fall back to global attention for long context.") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 6761b3c5e4..e7079235d6 
100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,14 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict +from dataclasses import dataclass from typing import TYPE_CHECKING, Optional import torch +from vllm.attention.backends.abstract import AttentionBackend from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.registry import MultiModalRegistry +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec @@ -122,6 +125,13 @@ class MultiModalBudget: return max_items_per_prompt, max_items_per_batch +@dataclass +class AttentionGroup: + backend: type[AttentionBackend] + metadata_builder: AttentionMetadataBuilder + layer_names: list[str] + + def sanity_check_mm_encoder_outputs( mm_embeddings: MultiModalEmbeddings, expected_num_items: int, @@ -196,6 +206,8 @@ def initialize_kv_cache_for_kv_sharing( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], kv_caches: dict[str, torch.Tensor], + # Optional for now to avoid breaking TPU + attn_groups: Optional[list[list[AttentionGroup]]] = None, ) -> None: """ Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches` @@ -225,6 +237,15 @@ def initialize_kv_cache_for_kv_sharing( group_idx = layer_to_kv_cache_group_idx[target_layer_name] kv_cache_groups[group_idx].layer_names.append(layer_name) + if attn_groups is not None: + assert len(attn_groups[group_idx]) == 1, ( + "Only one attention group per KV cache group is supported " + "for KV-cache sharing for now.") + # TODO(lucas): I think in the future the layers that re-use a + # KV cache will be in a different attention group so we can + # remove this code from here. + attn_groups[group_idx][0].layer_names.append(layer_name) + def bind_kv_cache( kv_caches: dict[str, torch.Tensor], From 6b47ef24de3d3b4f551aca0bc21b9f16f3d21b6a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 7 Aug 2025 10:28:11 +0800 Subject: [PATCH 040/932] [XPU]Fix `flash_attn_varlen_func` interface on xpu (#22350) Signed-off-by: Kunshang Ji --- vllm/_ipex_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 7533bf5ef7..79e3e448ca 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -271,6 +271,7 @@ class ipex_ops: k_descale=None, v_descale=None, num_splits=0, + s_aux: Optional[torch.Tensor] = None, ): if cu_seqlens_k is None: # cu_seqlens_k is not used in ipex kernel. From 7377131a2ccb49cae71aa503ee5be520aa080904 Mon Sep 17 00:00:00 2001 From: Tao He Date: Thu, 7 Aug 2025 10:58:08 +0800 Subject: [PATCH 041/932] [Qwen3] Enable dual-chunk-attention support for Qwen3 models. (#21924) Signed-off-by: Tao He --- vllm/model_executor/models/qwen3.py | 64 +++++++++++++++---------- vllm/model_executor/models/qwen3_moe.py | 27 ++++++++--- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index d2ae8959b1..0ad50640bb 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -23,7 +23,7 @@ # limitations under the License. 
"""Inference-only Qwen3 model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -47,27 +47,31 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + maybe_prefix) logger = init_logger(__name__) class Qwen3Attention(nn.Module): - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - head_dim: Optional[int] = None, - rms_norm_eps: float = 1e-06, - qkv_bias: bool = False, - rope_theta: float = 10000, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[tuple] = None, - prefix: str = "", - attn_type: str = AttentionType.DECODER) -> None: + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + rope_scaling: Optional[tuple] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -89,6 +93,7 @@ class Qwen3Attention(nn.Module): self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta + self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( hidden_size, @@ -113,15 +118,22 @@ class Qwen3Attention(nn.Module): max_position=max_position, base=self.rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + attn_type=attn_type, + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - attn_type=attn_type) self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -161,6 +173,9 @@ class Qwen3DecoderLayer(nn.Module): # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) # By default, Qwen3 uses causal attention as it is a decoder-only model. 
# You can override the HF config with `is_causal=False` to enable @@ -185,6 +200,7 @@ class Qwen3DecoderLayer(nn.Module): rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, ) self.mlp = Qwen3MLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index ca14fd0657..7410589190 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -185,6 +185,7 @@ class Qwen3MoeAttention(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -208,6 +209,7 @@ class Qwen3MoeAttention(nn.Module): self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings + self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear(hidden_size, self.head_dim, @@ -229,14 +231,21 @@ class Qwen3MoeAttention(nn.Module): max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": dual_chunk_attention_config, + } if dual_chunk_attention_config else {}, ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) @@ -280,6 +289,9 @@ class Qwen3MoeDecoderLayer(nn.Module): rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) self.self_attn = Qwen3MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -293,6 +305,7 @@ class Qwen3MoeDecoderLayer(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", + dual_chunk_attention_config=dual_chunk_attention_config, ) # `mlp_only_layers` in the config. 
From 04cf435d95fee3e4c0ba521583c1a64bc348c89d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 7 Aug 2025 11:05:20 +0800 Subject: [PATCH 042/932] [Bugfix] Fix wrong method name in Intern-S1 image processor (#22417) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interns1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index ab21cbe91a..d952ced2fa 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -161,7 +161,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): if not isinstance(processor, GotOcr2ImageProcessorFast): raise ValueError(f'GotOcr2ImageProcessorFast is expected but got ' f'{type(processor)}') - num_image_patches = processor.get_number_of_image_tokens( + num_image_patches = processor.get_number_of_image_patches( image_height, image_width, images_kwargs=dict()) num_image_tokens = self.get_hf_processor( ).image_seq_length * num_image_patches From a00d8b236f515d8c29c6afc2ecb98aef22788ae1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Aug 2025 23:07:47 -0400 Subject: [PATCH 043/932] Use float32 for test_completion.py (#22385) Signed-off-by: Michael Goin --- tests/v1/entrypoints/openai/test_completion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 2462f8f9f1..3a65583fab 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -20,9 +20,8 @@ MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") def default_server_args(): return [ - # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + "float32", "--max-model-len", "2048", "--max-num-seqs", From 5e9455ae8f33599865f8855b28db2d074ea04eb5 Mon Sep 17 00:00:00 2001 From: qscqesze Date: Thu, 7 Aug 2025 11:30:27 +0800 Subject: [PATCH 044/932] [Bugfix]: Fix the streaming output for function calls in the minimax (#22015) Signed-off-by: QscQ Signed-off-by: qingjun --- tests/tool_use/test_minimax_tool_parser.py | 846 ++++++++++++++++- .../tool_parsers/minimax_tool_parser.py | 850 +++++++++++++----- 2 files changed, 1493 insertions(+), 203 deletions(-) diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py index 49b8e4b96f..ddf2600712 100644 --- a/tests/tool_use/test_minimax_tool_parser.py +++ b/tests/tool_use/test_minimax_tool_parser.py @@ -3,10 +3,12 @@ # ruff: noqa: E501 import json +from typing import Any import pytest -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam, + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser from vllm.transformers_utils.tokenizer import get_tokenizer @@ -24,6 +26,57 @@ def minimax_tool_parser(minimax_tokenizer): return MinimaxToolParser(minimax_tokenizer) +@pytest.fixture +def sample_tools(): + return [ + ChatCompletionToolsParam(type="function", + function={ + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + }, + "state": { + "type": "string", + "description": + "The state code" + }, + "unit": { + "type": "string", + "enum": + ["fahrenheit", "celsius"] + } + }, + "required": ["city", 
"state"] + } + }), + ChatCompletionToolsParam(type="function", + function={ + "name": "calculate_area", + "description": + "Calculate area of a shape", + "parameters": { + "type": "object", + "properties": { + "shape": { + "type": "string" + }, + "dimensions": { + "type": "object" + }, + "precision": { + "type": "integer" + } + } + } + }) + ] + + def assert_tool_calls(actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]): assert len(actual_tool_calls) == len(expected_tool_calls) @@ -370,3 +423,794 @@ def test_extract_tool_calls_multiline_json_not_supported(minimax_tool_parser): assert not extracted_tool_calls.tools_called assert extracted_tool_calls.tool_calls == [] assert extracted_tool_calls.content is None + + +def test_streaming_arguments_incremental_output(minimax_tool_parser): + """Test that streaming arguments are returned incrementally, not cumulatively.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + + # Simulate progressive tool call building + stages = [ + # Stage 1: Function name complete + '\n{"name": "get_current_weather", "arguments": ', + # Stage 2: Arguments object starts with first key + '\n{"name": "get_current_weather", "arguments": {"city": ', + # Stage 3: First parameter value added + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle"', + # Stage 4: Second parameter added + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA"', + # Stage 5: Third parameter added, arguments complete + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + # Stage 6: Tool calls closed + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n' + ] + + function_name_sent = False + previous_args_content = "" + + for i, current_text in enumerate(stages): + previous_text = stages[i - 1] if i > 0 else "" + delta_text = current_text[len(previous_text + ):] if i > 0 else current_text + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Stage {i}: Current text: {repr(current_text)}") + print(f"Stage {i}: Delta text: {repr(delta_text)}") + + if result is not None and hasattr(result, + 'tool_calls') and result.tool_calls: + tool_call = result.tool_calls[0] + + # Check if function name is sent (should happen only once) + if tool_call.function and tool_call.function.name: + assert tool_call.function.name == "get_current_weather" + function_name_sent = True + print( + f"Stage {i}: Function name sent: {tool_call.function.name}" + ) + + # Check if arguments are sent incrementally + if tool_call.function and tool_call.function.arguments: + args_fragment = tool_call.function.arguments + print( + f"Stage {i}: Got arguments fragment: {repr(args_fragment)}" + ) + + # For incremental output, each fragment should be new content only + # The fragment should not contain all previous content + if i >= 2 and previous_args_content: # After we start getting arguments + # The new fragment should not be identical to or contain all previous content + assert args_fragment != 
previous_args_content, f"Fragment should be incremental, not cumulative: {args_fragment}" + + # If this is truly incremental, the fragment should be relatively small + # compared to the complete arguments so far + if len(args_fragment) > len(previous_args_content): + print( + "Warning: Fragment seems cumulative rather than incremental" + ) + + previous_args_content = args_fragment + + # Verify function name was sent at least once + assert function_name_sent, "Function name should have been sent" + + +def test_streaming_arguments_delta_only(minimax_tool_parser): + """Test that each streaming call returns only the delta (new part) of arguments.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + + # Simulate two consecutive calls with growing arguments + call1_text = '\n{"name": "test_tool", "arguments": {"param1": "value1"}}' + call2_text = '\n{"name": "test_tool", "arguments": {"param1": "value1", "param2": "value2"}}' + + print(f"Call 1 text: {repr(call1_text)}") + print(f"Call 2 text: {repr(call2_text)}") + + # First call - should get the function name and initial arguments + result1 = minimax_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text=call1_text, + delta_text=call1_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result 1: {result1}") + if result1 and hasattr(result1, 'tool_calls') and result1.tool_calls: + for i, tc in enumerate(result1.tool_calls): + print(f" Tool call {i}: {tc}") + + # Second call - should only get the delta (new part) of arguments + result2 = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=call1_text, + current_text=call2_text, + delta_text=', "param2": "value2"}', + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result 2: {result2}") + if result2 and hasattr(result2, 'tool_calls') and result2.tool_calls: + for i, tc in enumerate(result2.tool_calls): + print(f" Tool call {i}: {tc}") + + # Verify the second call only returns the delta + if result2 is not None and hasattr(result2, + 'tool_calls') and result2.tool_calls: + tool_call = result2.tool_calls[0] + if tool_call.function and tool_call.function.arguments: + args_delta = tool_call.function.arguments + print(f"Arguments delta from second call: {repr(args_delta)}") + + # Should only contain the new part, not the full arguments + # The delta should be something like ', "param2": "value2"}' or just '"param2": "value2"' + assert ', "param2": "value2"}' in args_delta or '"param2": "value2"' in args_delta, f"Expected delta containing param2, got: {args_delta}" + + # Should NOT contain the previous parameter data + assert '"param1": "value1"' not in args_delta, f"Arguments delta should not contain previous data: {args_delta}" + + # The delta should be relatively short (incremental, not cumulative) + expected_max_length = len( + ', "param2": "value2"}') + 10 # Some tolerance + assert len( + args_delta + ) <= expected_max_length, f"Delta seems too long (possibly cumulative): {args_delta}" + + print("✓ Delta validation passed") + else: + print("No arguments in result2 tool call") + else: + print("No tool calls in result2 or result2 is None") + # This might be acceptable if no incremental update is needed + # But let's at least verify that result1 had some content + assert result1 is not None, 
"At least the first call should return something" + + +def test_streaming_openai_compatibility(minimax_tool_parser): + """Test that streaming behavior with buffering works correctly.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + # Reset buffering state + minimax_tool_parser.pending_buffer = "" + minimax_tool_parser.in_thinking_tag = False + minimax_tool_parser.thinking_depth = 0 + + # Test scenario: simple buffering without complex tool call context + test_cases: list[dict[str, Any]] = [ + { + 'stage': 'Token: <', + 'previous': '', + 'current': '<', + 'delta': '<', + 'expected_content': None, # Should be buffered + }, + { + 'stage': 'Token: tool_calls>', + 'previous': '<', + 'current': '', + 'delta': 'tool_calls>', + 'expected_content': None, # Complete tag, should not output + }, + { + 'stage': 'Regular content', + 'previous': 'Hello', + 'current': 'Hello world', + 'delta': ' world', + 'expected_content': ' world', # Normal content should pass through + }, + { + 'stage': 'Content with end tag start', + 'previous': 'Text', + 'current': 'Text content', + 'delta': 'calls>', + 'expected_content': None, # Complete close tag, should not output + }, + ] + + for i, test_case in enumerate(test_cases): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print(f"Previous: {repr(test_case['previous'])}") + print(f"Current: {repr(test_case['current'])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content, got {result}" + print("✓ No content output as expected") + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}" + print(f"✓ Content matches: {repr(result.content)}") + + print("✓ Streaming test with buffering completed successfully") + + +def test_streaming_thinking_tag_buffering(minimax_tool_parser): + """Test that tool calls within thinking tags are properly handled during streaming.""" + # Reset streaming state + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.streamed_args_for_tool = [] + # Reset buffering state + minimax_tool_parser.pending_buffer = "" + minimax_tool_parser.in_thinking_tag = False + minimax_tool_parser.thinking_depth = 0 + + # Test scenario: tool calls within thinking tags should be ignored + test_cases: list[dict[str, Any]] = [ + { + 'stage': 'Start thinking', + 'previous': '', + 'current': 'I need to use a tool. ', + 'delta': 'I need to use a tool. ', + 'expected_content': + 'I need to use a tool. ', # Should pass through as content + }, + { + 'stage': + 'Tool call in thinking', + 'previous': + 'I need to use a tool. ', + 'current': + 'I need to use a tool. 
\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'delta': + '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'expected_content': + '\n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', # should be preserved in thinking tags + }, + { + 'stage': 'Real tool call after thinking', + 'previous': + 'I need to use a tool. \n{"name": "ignored_tool", "arguments": {"param": "value"}}\n', + 'current': + 'I need to use a tool. \n{"name": "ignored_tool", "arguments": {"param": "value"}}\n\n', + 'delta': '\n', + 'expected_content': + '\n', # Should output '\n' and suppress + } + ] + + for i, test_case in enumerate(test_cases): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print(f"Previous: {repr(test_case['previous'])}") + print(f"Current: {repr(test_case['current'])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if 'expected_content' in test_case: + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content, got {result}" + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {test_case['expected_content']}, got {result.content}" + print(f"✓ Content matches: {repr(result.content)}") + + # Check tool calls + if test_case.get('expected_tool_call'): + assert result is not None and hasattr(result, 'tool_calls') and result.tool_calls, \ + f"Stage {i}: Expected tool call, got {result}" + + tool_call = result.tool_calls[0] + assert tool_call.function.name == "real_tool", \ + f"Expected real_tool, got {tool_call.function.name}" + print(f"✓ Real tool call detected: {tool_call.function.name}") + + print("✓ Thinking tag buffering test completed successfully") + + +def reset_streaming_state(minimax_tool_parser): + """Helper function to properly reset the streaming state for MinimaxToolParser.""" + # Reset minimax-specific state + minimax_tool_parser._reset_streaming_state() + + # Reset base class state (these should still be reset for compatibility) + minimax_tool_parser.prev_tool_call_arr = [] + minimax_tool_parser.current_tool_id = -1 + minimax_tool_parser.current_tool_name_sent = False + minimax_tool_parser.streamed_args_for_tool = [] + + +def test_streaming_complex_scenario_with_multiple_tools(minimax_tool_parser): + """Test complex streaming scenario: tools inside tags and multiple tool calls in one group.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Complex scenario: tools inside thinking tags and multiple tools in one group + test_stages: list[dict[str, Any]] = [ + { + 'stage': 'Initial content', + 'previous': '', + 'current': 'Let me help you with this task.', + 'delta': 'Let me help you with this task.', + 'expected_content': 'Let me help you with this task.', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Start thinking tag', + 'previous': 'Let me help you with this task.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.', + 'delta': 'I need to analyze this situation first.', + 'expected_content': + 'I need to analyze 
this situation first.', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Tool call inside thinking tag starts', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.', + 'delta': '', + 'expected_content': + '', # Inside thinking tags, tool tags should be preserved as content + 'expected_tool_calls': 0, + }, + { + 'stage': 'Complete tool call inside thinking tag', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'delta': + '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'expected_content': + '\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'expected_tool_calls': + 0, # Tools inside thinking tags should be ignored + }, + { + 'stage': 'End thinking tag', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'delta': '', + 'expected_content': '', + 'expected_tool_calls': 0, + }, + { + 'stage': 'Multiple tools group starts', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.', + 'delta': + '\nNow I need to get weather information and calculate area.', + 'expected_content': + '\nNow I need to get weather information and calculate area.', # should be filtered + 'expected_tool_calls': 0, + }, + { + 'stage': 'First tool in group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'delta': + '\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'expected_content': + None, # No content should be output when tool call is in progress + 'expected_tool_calls': 1, + 'expected_tool_name': 'get_current_weather', + }, + { + 'stage': 'Second tool in group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need 
to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'delta': + '\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'expected_content': None, + 'expected_tool_calls': 1, + 'expected_tool_name': 'calculate_area', + }, + { + 'stage': 'Complete tool calls group', + 'previous': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'current': + 'Let me help you with this task.I need to analyze this situation first.\n{"name": "internal_analysis", "arguments": {"query": "analyze situation"}}\n\nNow I need to get weather information and calculate area.\n{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}}\n{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}}', + 'delta': '', + 'expected_content': None, + 'expected_tool_calls': 0, + } + ] + + tool_calls_count = 0 + + for i, test_case in enumerate(test_stages): + print(f"\n--- Stage {i}: {test_case['stage']} ---") + print( + f"Previous: {repr(test_case['previous'][:100])}{'...' if len(test_case['previous']) > 100 else ''}" + ) + print(f"Current: {repr(test_case['current'][-100:])}") + print(f"Delta: {repr(test_case['delta'])}") + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=test_case['previous'], + current_text=test_case['current'], + delta_text=test_case['delta'], + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + print(f"Result: {result}") + + # Check expected content + if test_case['expected_content'] is None: + assert result is None or not getattr(result, 'content', None), \ + f"Stage {i}: Expected no content output, got {result}" + print("✓ No content output as expected") + else: + assert result is not None and hasattr(result, 'content'), \ + f"Stage {i}: Expected content output, got {result}" + assert result.content == test_case['expected_content'], \ + f"Stage {i}: Expected content {repr(test_case['expected_content'])}, got {repr(result.content)}" + print(f"✓ Content matches: {repr(result.content)}") + + # Check tool calls + expected_tool_calls = test_case['expected_tool_calls'] + actual_tool_calls = len(result.tool_calls) if result and hasattr( + result, 'tool_calls') and result.tool_calls else 0 + + if expected_tool_calls > 0: + assert actual_tool_calls >= expected_tool_calls, \ + f"Stage {i}: Expected at least {expected_tool_calls} tool calls, got {actual_tool_calls}" + + if 'expected_tool_name' in test_case: + # Find the tool call with the expected name + found_tool_call = None + for tool_call in result.tool_calls: + if tool_call.function.name == test_case[ + 'expected_tool_name']: + found_tool_call = tool_call + break + + assert found_tool_call is not None, \ + f"Stage {i}: Expected tool name {test_case['expected_tool_name']} not found in tool calls: {[tc.function.name for tc in result.tool_calls]}" + print(f"✓ Tool call correct: 
{found_tool_call.function.name}") + + # Ensure tools inside thinking tags are not called + assert found_tool_call.function.name != "internal_analysis", \ + f"Stage {i}: Tool 'internal_analysis' inside thinking tags should not be called" + + tool_calls_count += actual_tool_calls + print(f"✓ Detected {actual_tool_calls} tool calls") + else: + assert actual_tool_calls == 0, \ + f"Stage {i}: Expected no tool calls, got {actual_tool_calls}" + + # Verify overall results + print("\n=== Test Summary ===") + print(f"Total tool calls count: {tool_calls_count}") + assert tool_calls_count >= 2, f"Expected at least 2 valid tool calls (outside thinking tags), but got {tool_calls_count}" + + print("✓ Complex streaming test completed:") + print(" - ✓ Tools inside thinking tags correctly ignored") + print(" - ✓ Two tool groups outside thinking tags correctly parsed") + print(" - ✓ Content and tool call streaming correctly handled") + print(" - ✓ Buffering mechanism works correctly") + + +def test_streaming_character_by_character_output(minimax_tool_parser): + """Test character-by-character streaming output to simulate real streaming scenarios.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Complete text that will be streamed character by character + complete_text = """I'll help you with the weather analysis. Let me think about this. +{"name": "internal_analysis", "arguments": {"type": "thinking"}} +This tool should be ignored. + +Now I'll get the weather information for you. +{"name": "get_current_weather", "arguments": {"city": "Seattle", "state": "WA", "unit": "celsius"}} +{"name": "calculate_area", "arguments": {"shape": "rectangle", "dimensions": {"width": 10, "height": 5}}} +Here are the results.""" + + print("\n=== Starting character-by-character streaming test ===") + print(f"Complete text length: {len(complete_text)} characters") + + # Track the streaming results + content_fragments = [] + tool_calls_detected = [] + + # Stream character by character + for i in range(1, len(complete_text) + 1): + current_text = complete_text[:i] + previous_text = complete_text[:i - 1] if i > 1 else "" + delta_text = complete_text[i - 1:i] + + # Show progress every 50 characters + if i % 50 == 0 or i == len(complete_text): + print(f"Progress: {i}/{len(complete_text)} characters") + + # Call the streaming parser + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + # Collect results + if result is not None: + if hasattr(result, 'content') and result.content: + content_fragments.append(result.content) + # Log important content fragments + if any( + keyword in result.content for keyword in + ['', '', '', '']): + print( + f" Char {i}: Content fragment: {repr(result.content)}" + ) + + if hasattr(result, 'tool_calls') and result.tool_calls: + for tool_call in result.tool_calls: + tool_info = { + 'character_position': + i, + 'function_name': + tool_call.function.name + if tool_call.function else None, + 'arguments': + tool_call.function.arguments + if tool_call.function else None, + } + tool_calls_detected.append(tool_info) + print( + f" Char {i}: Tool call detected: {tool_call.function.name}" + ) + if tool_call.function.arguments: + print( + f" Arguments: {repr(tool_call.function.arguments)}" + ) + + # Verify results + print("\n=== Streaming Test Results ===") + print(f"Total content fragments: 
{len(content_fragments)}") + print(f"Total tool calls detected: {len(tool_calls_detected)}") + + # Reconstruct content from fragments + reconstructed_content = ''.join(content_fragments) + print(f"Reconstructed content length: {len(reconstructed_content)}") + + # Verify thinking tags content is preserved + assert '' in reconstructed_content, "Opening thinking tag should be preserved in content" + assert '' in reconstructed_content, "Closing thinking tag should be preserved in content" + + # Verify that tool calls inside thinking tags are NOT extracted as actual tool calls + thinking_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'internal_analysis' + ] + assert len( + thinking_tool_calls + ) == 0, f"Tool calls inside thinking tags should be ignored, but found: {thinking_tool_calls}" + + # Verify that real tool calls outside thinking tags ARE extracted + weather_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'get_current_weather' + ] + area_tool_calls = [ + tc for tc in tool_calls_detected + if tc['function_name'] == 'calculate_area' + ] + print(tool_calls_detected) + assert len(weather_tool_calls + ) > 0, "get_current_weather tool call should be detected" + assert len( + area_tool_calls) > 0, "calculate_area tool call should be detected" + + # Verify tool call arguments are properly streamed + weather_args_found = any(tc['arguments'] for tc in weather_tool_calls + if tc['arguments']) + area_args_found = any(tc['arguments'] for tc in area_tool_calls + if tc['arguments']) + + print(f"Weather tool call with arguments: {weather_args_found}") + print(f"Area tool call with arguments: {area_args_found}") + + # Verify content before and after tool calls + assert 'I\'ll help you with the weather analysis.' in reconstructed_content, "Initial content should be preserved" + assert 'Here are the results.' in reconstructed_content, "Final content should be preserved" + + # Verify that and tags are not included in the final content + # (they should be filtered out when not inside thinking tags) + content_outside_thinking = reconstructed_content + # Remove thinking tag content to check content outside + if '' in content_outside_thinking and '' in content_outside_thinking: + start_think = content_outside_thinking.find('') + end_think = content_outside_thinking.find('') + len('') + content_outside_thinking = content_outside_thinking[: + start_think] + content_outside_thinking[ + end_think:] + + # Outside thinking tags, tool_calls tags should be filtered + tool_calls_in_content = content_outside_thinking.count('') + assert tool_calls_in_content == 0, f" tags should be filtered from content outside thinking tags, but found {tool_calls_in_content}" + + print( + "\n=== Character-by-character streaming test completed successfully ===" + ) + print("✓ Tool calls inside thinking tags correctly ignored") + print("✓ Tool calls outside thinking tags correctly detected") + print("✓ Content properly streamed and reconstructed") + print("✓ Tool call tags properly filtered from content") + print("✓ Character-level streaming works correctly") + + +def test_streaming_character_by_character_simple_tool_call( + minimax_tool_parser): + """Test character-by-character streaming for a simple tool call scenario.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Simple tool call text + simple_text = 'Let me check the weather. 
\n{"name": "get_weather", "arguments": {"city": "NYC"}}\n' + + print("\n=== Simple character-by-character test ===") + print(f"Text: {repr(simple_text)}") + + content_parts = [] + tool_name_sent = False + tool_args_sent = False + + for i in range(1, len(simple_text) + 1): + current_text = simple_text[:i] + previous_text = simple_text[:i - 1] if i > 1 else "" + delta_text = simple_text[i - 1:i] + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + if result: + if hasattr(result, 'content') and result.content: + content_parts.append(result.content) + print( + f" Char {i} ({repr(delta_text)}): Content: {repr(result.content)}" + ) + + if hasattr(result, 'tool_calls') and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function and tool_call.function.name: + tool_name_sent = True + print( + f" Char {i}: Tool name: {tool_call.function.name}" + ) + if tool_call.function and tool_call.function.arguments: + tool_args_sent = True + print( + f" Char {i}: Tool args: {repr(tool_call.function.arguments)}" + ) + + # Verify basic expectations + reconstructed_content = ''.join(content_parts) + print(f"Final reconstructed content: {repr(reconstructed_content)}") + + assert tool_name_sent, "Tool name should be sent during streaming" + assert tool_args_sent, "Tool arguments should be sent during streaming" + assert "Let me check the weather." in reconstructed_content, "Initial content should be preserved" + + print("✓ Simple character-by-character test passed") + + +def test_streaming_character_by_character_with_buffering(minimax_tool_parser): + """Test character-by-character streaming with edge cases that trigger buffering.""" + # Reset streaming state + reset_streaming_state(minimax_tool_parser) + + # Text that includes potential buffering scenarios + buffering_text = 'Hello world\n{"name": "test"}\ndone' + + print("\n=== Buffering character-by-character test ===") + print(f"Text: {repr(buffering_text)}") + + all_content = [] + + for i in range(1, len(buffering_text) + 1): + current_text = buffering_text[:i] + previous_text = buffering_text[:i - 1] if i > 1 else "" + delta_text = buffering_text[i - 1:i] + + result = minimax_tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + if result and hasattr(result, 'content') and result.content: + all_content.append(result.content) + print(f" Char {i} ({repr(delta_text)}): {repr(result.content)}") + + final_content = ''.join(all_content) + print(f"Final content: {repr(final_content)}") + + # The parser should handle the edge case where appears before + assert "Hello" in final_content, "Initial 'Hello' should be preserved" + assert "world" in final_content, "Content after false closing tag should be preserved" + assert "done" in final_content, "Final content should be preserved" + + print("✓ Buffering character-by-character test passed") diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 6ba32e38fc..226309ef29 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -3,11 +3,9 @@ import json from collections.abc import 
Sequence -from typing import Union +from typing import Any, Optional, Union -import partial_json_parser import regex as re -from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -17,6 +15,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -29,25 +29,32 @@ class MinimaxToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) - self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: list[dict] = [] - self.current_tool_id: int = -1 - self.streamed_args_for_tool: list[str] = [] - - self.tool_call_start_token: str = "<tool_calls>" - self.tool_call_end_token: str = "</tool_calls>" + # Initialize streaming state for tracking tool call progress + self.streaming_state: dict[str, Any] = { + "current_tool_index": -1, # Index of current tool being processed + "tool_ids": [], # List of tool call IDs + "sent_tools": [], # List of tools that have been sent + } + # Define tool call tokens and patterns + self.tool_call_start_token = "<tool_calls>" + self.tool_call_end_token = "</tool_calls>" self.tool_call_regex = re.compile( r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL) - - # Add regex pattern for thinking tag self.thinking_tag_pattern = r"<think>(.*?)</think>" + self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"') + self.tool_args_pattern = re.compile(r'"arguments":\s*') + + # Buffer for handling partial tool calls during streaming + self.pending_buffer = "" + self.in_thinking_tag = False if not self.model_tokenizer: raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") + # Get token IDs for tool call start/end tokens self.tool_call_start_token_id = self.vocab.get( self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) @@ -60,33 +67,95 @@ class MinimaxToolParser(ToolParser): def preprocess_model_output(self, model_output: str) -> str: """ - Remove tool calls from within thinking tags to avoid processing them. + Preprocess model output by removing tool calls from thinking tags. + + Args: + model_output: Raw model output string + + Returns: + Preprocessed model output with tool calls removed from thinking tags + """ def remove_tool_calls_from_think(match): think_content = match.group(1) - # Remove tool_calls from within the think tag cleaned_content = re.sub(r"<tool_calls>.*?</tool_calls>", "", think_content, flags=re.DOTALL) return f"<think>{cleaned_content}</think>" - # Process thinking tags and remove tool_calls from within them - processed_output = re.sub(self.thinking_tag_pattern, - remove_tool_calls_from_think, - model_output, - flags=re.DOTALL) + return re.sub(self.thinking_tag_pattern, + remove_tool_calls_from_think, + model_output, + flags=re.DOTALL) - return processed_output + def _clean_duplicate_braces(self, args_text: str) -> str: + """ + Clean duplicate closing braces from arguments text. 
+ + Args: + args_text: Raw arguments text + + Returns: + Cleaned arguments text with proper JSON formatting + """ + args_text = args_text.strip() + if not args_text: + return args_text + + try: + json.loads(args_text) + return args_text + except json.JSONDecodeError: + pass + + while args_text.endswith('}}'): + candidate = args_text[:-1] + try: + json.loads(candidate) + return candidate + except json.JSONDecodeError: + args_text = candidate + + return args_text + + def _clean_delta_braces(self, delta_text: str) -> str: + """ + Clean delta text by removing excessive closing braces. + + Args: + delta_text: Delta text to clean + + Returns: + Cleaned delta text + """ + if not delta_text: + return delta_text + + delta_stripped = delta_text.strip() + + if delta_stripped and all(c in '}\n\r\t ' for c in delta_stripped): + brace_count = delta_stripped.count('}') + if brace_count > 1: + return '}\n' if delta_text.endswith('\n') else '}' + + return delta_text def extract_tool_calls( self, model_output: str, request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: - - # Preprocess to remove tool calls from thinking tags + """ + Extract tool calls from model output for non-streaming mode. + + Args: + model_output: Complete model output + request: Chat completion request + + Returns: + ExtractedToolCallInformation containing tool calls and content + """ processed_output = self.preprocess_model_output(model_output) if self.tool_call_start_token not in processed_output: @@ -95,8 +164,8 @@ class MinimaxToolParser(ToolParser): content=model_output) try: - function_call_tuples = ( - self.tool_call_regex.findall(processed_output)) + function_call_tuples = self.tool_call_regex.findall( + processed_output) raw_function_calls = [] for match in function_call_tuples: @@ -124,21 +193,15 @@ class MinimaxToolParser(ToolParser): function_call["arguments"], ensure_ascii=False)))) - # Extract content before the first valid tool call - # Find the position in processed output, then map back to original processed_pos = processed_output.find(self.tool_call_start_token) if processed_pos != -1: - # Get the content before tool calls in processed output processed_content = processed_output[:processed_pos].strip() if processed_content: - # Find the end of this content in the original output - # Look for the last non-empty line of processed content lines = processed_content.split('\n') for line in reversed(lines): line = line.strip() if line: - # Find this line in original output pos = model_output.find(line) if pos != -1: content = model_output[:pos + len(line)] @@ -162,6 +225,445 @@ class MinimaxToolParser(ToolParser): tool_calls=[], content=model_output) + def _update_thinking_state(self, text: str) -> None: + """ + Update the thinking tag state based on text content. + + Args: + text: Text to analyze for thinking tags + """ + open_count = text.count("<think>") + close_count = text.count("</think>") + self.in_thinking_tag = open_count > close_count or ( + open_count == close_count and text.endswith("</think>")) + + def _is_potential_tag_start(self, text: str) -> bool: + """ + Check if text might be the start of a tool call tag. 
+ + Args: + text: Text to check + + Returns: + True if text could be the start of a tool call tag + """ + for tag in [self.tool_call_start_token, self.tool_call_end_token]: + if any( + tag.startswith(text[-i:]) + for i in range(1, min(len(text) + 1, len(tag)))): + return True + return False + + def _should_buffer_content(self, delta_text: str) -> bool: + """ + Determine if content should be buffered for later processing. + + Args: + delta_text: Delta text to check + + Returns: + True if content should be buffered + """ + if self.in_thinking_tag: + return False + return bool(self.pending_buffer + or self.tool_call_start_token in delta_text + or self.tool_call_end_token in delta_text + or delta_text.startswith('<')) + + def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]: + """ + Split delta text into safe content and potential tag content. + + Args: + delta_text: Delta text to split + + Returns: + Tuple of (safe_content, potential_tag_content) + """ + if self.in_thinking_tag: + return delta_text, "" + + for tag in [self.tool_call_start_token, self.tool_call_end_token]: + for i in range(1, len(tag)): + tag_prefix = tag[:i] + pos = delta_text.rfind(tag_prefix) + if pos != -1 and tag.startswith(delta_text[pos:]): + return delta_text[:pos], delta_text[pos:] + return delta_text, "" + + def _process_buffer(self, new_content: str) -> str: + """ + Process buffered content and return output content. + + Args: + new_content: New content to add to buffer + + Returns: + Processed output content + """ + self.pending_buffer += new_content + output_content = "" + + if self.in_thinking_tag: + output_content = self.pending_buffer + self.pending_buffer = "" + return output_content + + while self.pending_buffer: + start_pos = self.pending_buffer.find(self.tool_call_start_token) + end_pos = self.pending_buffer.find(self.tool_call_end_token) + + if start_pos != -1 and (end_pos == -1 or start_pos < end_pos): + tag_pos, tag_len = start_pos, len(self.tool_call_start_token) + elif end_pos != -1: + tag_pos, tag_len = end_pos, len(self.tool_call_end_token) + else: + if self._is_potential_tag_start(self.pending_buffer): + break + output_content += self.pending_buffer + self.pending_buffer = "" + break + + output_content += self.pending_buffer[:tag_pos] + self.pending_buffer = self.pending_buffer[tag_pos + tag_len:] + + return output_content + + def _reset_streaming_state(self) -> None: + """Reset the streaming state to initial values.""" + self.streaming_state = { + "current_tool_index": -1, + "tool_ids": [], + "sent_tools": [], + } + + def _advance_to_next_tool(self) -> None: + """Advance to the next tool in the streaming sequence.""" + self.streaming_state["current_tool_index"] = int( + self.streaming_state["current_tool_index"]) + 1 + + def _set_current_tool_index(self, index: int) -> None: + """ + Set the current tool index. + + Args: + index: Tool index to set + """ + self.streaming_state["current_tool_index"] = index + + def _get_current_tool_index(self) -> int: + """ + Get the current tool index. + + Returns: + Current tool index + """ + return int(self.streaming_state["current_tool_index"]) + + def _get_next_unsent_tool_index(self, tool_count: int) -> int: + """ + Get the index of the next unsent tool. 
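The buffering helpers above hold back any trailing piece of a delta that could still turn into a tool-call tag, so a tag that arrives split across deltas is never leaked to the client as plain content. A standalone sketch, using a hypothetical "<tool_calls>" tag (the real token strings are model-specific and configured on the parser):

def split_for_buffering(delta_text: str, tags: list[str]) -> tuple[str, str]:
    # Return (safe_content, potential_tag_content). The second part is the
    # tail of delta_text, starting at the right-most position whose remainder
    # is a prefix of one of the tags; it should be buffered until more text
    # arrives and either completes or rules out the tag.
    for tag in tags:
        for i in range(1, len(tag)):
            prefix = tag[:i]
            pos = delta_text.rfind(prefix)
            if pos != -1 and tag.startswith(delta_text[pos:]):
                return delta_text[:pos], delta_text[pos:]
    return delta_text, ""

# split_for_buffering("Let me check the weather <tool", ["<tool_calls>"])
# -> ("Let me check the weather ", "<tool")      (tag string is illustrative)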
+ + Args: + tool_count: Total number of tools + + Returns: + Index of next unsent tool, or -1 if all tools sent + """ + sent_tools = list(self.streaming_state["sent_tools"]) + for i in range(tool_count): + if i < len(sent_tools): + if not sent_tools[i]["sent_name"]: + return i + else: + return i + return -1 + + def _ensure_state_arrays(self, tool_count: int) -> None: + """ + Ensure state arrays have sufficient capacity for tool_count tools. + + Args: + tool_count: Number of tools to prepare for + """ + sent_tools = list(self.streaming_state["sent_tools"]) + tool_ids = list(self.streaming_state["tool_ids"]) + + while len(sent_tools) < tool_count: + sent_tools.append({ + "sent_name": False, + "sent_arguments": "", + "id": random_tool_call_id(), + }) + + while len(tool_ids) < tool_count: + tool_ids.append(None) + + self.streaming_state["sent_tools"] = sent_tools + self.streaming_state["tool_ids"] = tool_ids + + def _detect_tools_in_text(self, text: str) -> int: + """ + Detect the number of tools in text by counting name patterns. + + Args: + text: Text to analyze + + Returns: + Number of tools detected + """ + matches = self.tool_name_pattern.findall(text) + return len(matches) + + def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]: + """ + Find the boundaries of tool calls in text. + + Args: + text: Text to analyze + + Returns: + List of (start, end) positions for tool calls + """ + boundaries = [] + i = 0 + while i < len(text): + if text[i] == '{': + start = i + depth = 0 + has_name = False + has_arguments = False + + while i < len(text): + if text[i] == '{': + depth += 1 + elif text[i] == '}': + depth -= 1 + if depth == 0: + end = i + 1 + segment = text[start:end] + if '"name"' in segment and '"arguments"' in segment: + boundaries.append((start, end)) + break + + if not has_name and '"name"' in text[start:i + 1]: + has_name = True + if not has_arguments and '"arguments"' in text[start:i + + 1]: + has_arguments = True + + i += 1 + + if depth > 0 and has_name: + boundaries.append((start, i)) + else: + i += 1 + return boundaries + + def _extract_tool_args(self, tool_content: str, args_match) -> str: + """ + Extract tool arguments from tool content. + + Args: + tool_content: Tool call content + args_match: Regex match for arguments pattern + + Returns: + Extracted arguments as string + """ + args_start_pos = args_match.end() + remaining_content = tool_content[args_start_pos:] + + if remaining_content.strip().startswith('{'): + depth = 0 + for i, char in enumerate(remaining_content): + if char == '{': + depth += 1 + elif char == '}': + depth -= 1 + if depth == 0: + return remaining_content[:i + 1] + else: + args_end = remaining_content.find('}') + if args_end > 0: + return remaining_content[:args_end].strip() + + return remaining_content.rstrip('}').strip() + + def _get_current_tool_content( + self, text: str, + tool_index: int) -> tuple[Optional[str], Optional[str]]: + """ + Get the content of a specific tool by index. 
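Boundary detection above walks the text with a brace-depth counter rather than a JSON parser, so objects that are still incomplete mid-stream can be located. A self-contained sketch of the same scan (simplified: the has_name/has_arguments bookkeeping is folded into plain substring checks):

def find_tool_boundaries(text: str) -> list[tuple[int, int]]:
    # Locate JSON-object spans that look like tool calls. A balanced object
    # counts only if it mentions both "name" and "arguments"; an object that
    # is still open at the end of the text counts as long as it has a name,
    # so its arguments can be streamed while incomplete.
    boundaries: list[tuple[int, int]] = []
    i = 0
    while i < len(text):
        if text[i] != '{':
            i += 1
            continue
        start, depth = i, 0
        while i < len(text):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
                if depth == 0:
                    segment = text[start:i + 1]
                    if '"name"' in segment and '"arguments"' in segment:
                        boundaries.append((start, i + 1))
                    break
            i += 1
        if depth > 0 and '"name"' in text[start:]:
            boundaries.append((start, len(text)))
        i += 1
    return boundaries

# find_tool_boundaries('{"name": "f", "arguments": {"x": 1}}') -> [(0, 36)]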
+ + Args: + text: Text containing tool calls + tool_index: Index of tool to extract + + Returns: + Tuple of (tool_name, tool_arguments) or (None, None) if not found + """ + boundaries = self._find_tool_boundaries(text) + + if tool_index >= len(boundaries): + return None, None + + start, end = boundaries[tool_index] + tool_content = text[start:end] + + name_match = self.tool_name_pattern.search(tool_content) + name = name_match.group(1) if name_match else None + + args_match = self.tool_args_pattern.search(tool_content) + if args_match: + try: + args_text = self._extract_tool_args(tool_content, args_match) + return name, args_text + except Exception: + remaining_content = tool_content[args_match.end():] + args_text = remaining_content.rstrip('}').strip() + return name, args_text + + return name, None + + def _handle_tool_name_streaming( + self, tool_content: str, + tool_count: int) -> Union[DeltaMessage, None]: + """ + Handle streaming of tool names. + + Args: + tool_content: Content containing tool calls + tool_count: Total number of tools + + Returns: + DeltaMessage with tool name or None if no tool to stream + """ + next_idx = self._get_next_unsent_tool_index(tool_count) + + if next_idx == -1: + return None + + boundaries = self._find_tool_boundaries(tool_content) + if next_idx >= len(boundaries): + return None + + tool_name, _ = self._get_current_tool_content(tool_content, next_idx) + if not tool_name: + return None + + self._set_current_tool_index(next_idx) + sent_tools = list(self.streaming_state["sent_tools"]) + tool_ids = list(self.streaming_state["tool_ids"]) + + tool_id = sent_tools[next_idx]["id"] + tool_ids[next_idx] = tool_id + sent_tools[next_idx]["sent_name"] = True + + self.streaming_state["sent_tools"] = sent_tools + self.streaming_state["tool_ids"] = tool_ids + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=next_idx, + type="function", + id=tool_id, + function=DeltaFunctionCall( + name=tool_name).model_dump(exclude_none=True)) + ]) + + def _handle_tool_args_streaming( + self, tool_content: str, + tool_count: int) -> Union[DeltaMessage, None]: + """ + Handle streaming of tool arguments. 
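Argument streaming below only ever emits the portion of the (cleaned) argument string that has not been sent yet; the real implementation goes through extract_intermediate_diff and the _clean_* helpers above, but the core bookkeeping reduces to a prefix diff along these lines (a simplified sketch, not the exact code):

def args_delta(sent_args: str, current_args: str) -> str:
    # Emit only the new suffix when the previously streamed arguments are a
    # prefix of the current ones; otherwise re-send the full string (e.g.
    # when brace cleanup rewrote earlier characters).
    if sent_args and current_args.startswith(sent_args):
        return current_args[len(sent_args):]
    return current_args

# args_delta('{"city": "Pa', '{"city": "Paris"}') -> 'ris"}'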
+ + Args: + tool_content: Content containing tool calls + tool_count: Total number of tools + + Returns: + DeltaMessage with tool arguments or None if no arguments to stream + """ + current_idx = self._get_current_tool_index() + + if current_idx < 0 or current_idx >= tool_count: + return None + + tool_name, tool_args = self._get_current_tool_content( + tool_content, current_idx) + if not tool_name or tool_args is None: + return None + + sent_tools = list(self.streaming_state["sent_tools"]) + + if not sent_tools[current_idx]["sent_name"]: + return None + + clean_args = self._clean_duplicate_braces(tool_args) + sent_args = sent_tools[current_idx]["sent_arguments"] + + if clean_args != sent_args: + if sent_args and clean_args.startswith(sent_args): + args_delta = extract_intermediate_diff(clean_args, sent_args) + if args_delta: + args_delta = self._clean_delta_braces(args_delta) + sent_tools[current_idx]["sent_arguments"] = clean_args + self.streaming_state["sent_tools"] = sent_tools + + if clean_args.endswith('}'): + self._advance_to_next_tool() + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=current_idx, + function=DeltaFunctionCall( + arguments=args_delta).model_dump( + exclude_none=True)) + ]) + elif not sent_args and clean_args: + clean_args_delta = self._clean_delta_braces(clean_args) + sent_tools[current_idx]["sent_arguments"] = clean_args + self.streaming_state["sent_tools"] = sent_tools + + if clean_args.endswith('}'): + self._advance_to_next_tool() + + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=current_idx, + function=DeltaFunctionCall( + arguments=clean_args_delta).model_dump( + exclude_none=True)) + ]) + + return None + + def _is_end_tool_calls(self, current_text: str) -> bool: + if self.tool_call_end_token not in current_text: + return False + + end_token_positions = [] + search_start = 0 + while True: + pos = current_text.find(self.tool_call_end_token, search_start) + if pos == -1: + break + end_token_positions.append(pos) + search_start = pos + 1 + + think_regions = [] + for match in re.finditer(self.thinking_tag_pattern, + current_text, + flags=re.DOTALL): + think_regions.append((match.start(), match.end())) + + for pos in end_token_positions: + in_think = any(pos >= t_start and pos < t_end + for t_start, t_end in think_regions) + if not in_think: + return True + + return False + def extract_tool_calls_streaming( self, previous_text: str, @@ -172,13 +674,37 @@ class MinimaxToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: - logger.debug("delta_text: %s", delta_text) - logger.debug("delta_token_ids: %s", delta_token_ids) + self._update_thinking_state(current_text) + + if self.in_thinking_tag: + return DeltaMessage(content=delta_text) + + if self._should_buffer_content(delta_text): + buffered_output = self._process_buffer(delta_text) + return DeltaMessage( + content=buffered_output) if buffered_output else None + + if self._is_end_tool_calls(current_text): + return DeltaMessage(content=delta_text) + + safe_content, potential_tag = self._split_content_for_buffering( + delta_text) + if potential_tag: + self.pending_buffer += potential_tag + return DeltaMessage(content=safe_content) if safe_content else None - # Preprocess to remove tool calls from thinking tags processed_current_text = self.preprocess_model_output(current_text) if self.tool_call_start_token not in processed_current_text: + if (self.tool_call_end_token in delta_text + and self.tool_call_start_token in current_text): + 
return None + if delta_text.strip( + ) == '' and self.tool_call_start_token in current_text: + return None + if (self._get_current_tool_index() != -1 + and self.tool_call_end_token in current_text): + self._reset_streaming_state() return DeltaMessage(content=delta_text) if (self.tool_call_start_token_id is not None @@ -186,184 +712,104 @@ class MinimaxToolParser(ToolParser): and len(delta_token_ids) == 1): return None - original_tool_call_start_pos = current_text.find( - self.tool_call_start_token) - if original_tool_call_start_pos > 0: - delta_start_pos = len(current_text) - len(delta_text) - if delta_start_pos < original_tool_call_start_pos: - content_part = delta_text - if delta_start_pos + len( - delta_text) > original_tool_call_start_pos: - content_part = delta_text[:original_tool_call_start_pos - - delta_start_pos] - if content_part: - return DeltaMessage(content=content_part) + original_tool_start = self._find_tool_start_outside_thinking( + current_text) + if original_tool_start is None: + return None - flags = Allow.ALL if self.current_tool_name_sent \ - else Allow.ALL & ~Allow.STR + content_before_tools = self._extract_content_before_tools( + current_text, delta_text, original_tool_start) + if content_before_tools: + return DeltaMessage(content=content_before_tools) try: - parsable_content = processed_current_text.split( - self.tool_call_start_token)[-1].split( - self.tool_call_end_token)[0] + tool_content = self._extract_tool_content(current_text, + original_tool_start) + current_tools_count = self._detect_tools_in_text(tool_content) - tool_call_arr = [] - if parsable_content.strip(): - lines = parsable_content.strip().split('\n') - for line in lines: - line = line.strip() - if line and (line.startswith('{') or '"name"' in line): - try: - if line.endswith('}'): - parsed_call = json.loads(line) - tool_call_arr.append(parsed_call) - else: - parsed_call = partial_json_parser.loads( - line, flags) - if parsed_call and isinstance( - parsed_call, dict): - tool_call_arr.append(parsed_call) - except (json.JSONDecodeError, partial_json_parser.core. - exceptions.MalformedJSON): - continue - - current_tool_call: dict = tool_call_arr[self.current_tool_id] \ - if len(tool_call_arr) > self.current_tool_id >= 0 else {} - - if len(tool_call_arr) == 0: + if current_tools_count == 0: return None - # Starting a new tool in the array - elif (len(tool_call_arr) > 0 - and len(tool_call_arr) > self.current_tool_id + 1): + if self._get_current_tool_index() == -1: + self._reset_streaming_state() - # Handle any missed arguments from previous tool - if self.current_tool_id >= 0 and self.current_tool_id < len( - self.prev_tool_call_arr): - prev_tool_call = self.prev_tool_call_arr[ - self.current_tool_id] - diff_arguments = prev_tool_call.get("arguments") + self._ensure_state_arrays(current_tools_count) - if diff_arguments: - diff_arguments_json = json.dumps(diff_arguments, - ensure_ascii=False) - already_streamed = self.streamed_args_for_tool[ - self. 
- current_tool_id] if self.current_tool_id < len( - self.streamed_args_for_tool) else "" - - if diff_arguments_json != already_streamed: - diff = diff_arguments_json[len(already_streamed):] - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=diff).model_dump( - exclude_none=True)) - ]) - if self.current_tool_id < len( - self.streamed_args_for_tool): - self.streamed_args_for_tool[ - self.current_tool_id] = diff_arguments_json - else: - delta = None - else: - delta = None - else: - delta = None - - self.current_tool_id = len(tool_call_arr) - 1 - self.current_tool_name_sent = False - self.streamed_args_for_tool.append("") - logger.debug("starting on new tool %d", self.current_tool_id) - return delta - - # Send tool name if not sent yet - if not self.current_tool_name_sent: - function_name = current_tool_call.get("name") - if function_name: - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - type="function", - id=random_tool_call_id(), - function=DeltaFunctionCall( - name=function_name).model_dump( - exclude_none=True)) - ]) - self.current_tool_name_sent = True - else: - delta = None - - # Stream arguments - else: - prev_arguments = None - if (self.current_tool_id < len(self.prev_tool_call_arr) - and self.prev_tool_call_arr[self.current_tool_id]): - prev_arguments = self.prev_tool_call_arr[ - self.current_tool_id].get("arguments") - - cur_arguments = current_tool_call.get("arguments") - - if not cur_arguments and not prev_arguments: - delta = None - elif not cur_arguments and prev_arguments: - logger.error( - "Arguments reset mid-call, skipping streaming") - delta = None - elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) - logger.debug("First tokens in arguments received: %s", - cur_arguments_json) - - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=cur_arguments_json). - model_dump(exclude_none=True)) - ]) - self.streamed_args_for_tool[ - self.current_tool_id] = cur_arguments_json - - elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments, - ensure_ascii=False) - prev_args_json = json.dumps(prev_arguments, - ensure_ascii=False) - - logger.debug("Searching for diff between \n%s\n%s", - cur_args_json, prev_args_json) - - already_streamed = self.streamed_args_for_tool[ - self.current_tool_id] if self.current_tool_id < len( - self.streamed_args_for_tool) else "" - - if cur_args_json.startswith(already_streamed): - argument_diff = cur_args_json[len(already_streamed):] - elif cur_args_json != already_streamed: - argument_diff = cur_args_json - self.streamed_args_for_tool[self.current_tool_id] = "" - else: - argument_diff = "" - - if argument_diff: - logger.debug("got arguments diff: %s", argument_diff) - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=argument_diff). 
- model_dump(exclude_none=True)) - ]) - self.streamed_args_for_tool[ - self.current_tool_id] += argument_diff - else: - delta = None - else: - delta = None - - self.prev_tool_call_arr = tool_call_arr - return delta + return (self._handle_tool_name_streaming(tool_content, + current_tools_count) + or self._handle_tool_args_streaming( + tool_content, current_tools_count)) except Exception: - logger.exception("An unexpected error occurred", + logger.exception("An unexpected error occurred ", "during streaming tool call handling.") return None + + def _find_tool_start_outside_thinking(self, + current_text: str) -> Optional[int]: + """ + Find the start position of tool calls outside of thinking tags. + + Args: + current_text: Current text to search + + Returns: + Position of tool call start or None if not found + """ + search_start = 0 + while True: + pos = current_text.find(self.tool_call_start_token, search_start) + if pos == -1: + return None + + think_regions = [(m.start(), m.end()) for m in re.finditer( + r"(.*?)", current_text, flags=re.DOTALL)] + in_think = any(pos >= t_start and pos < t_end + for t_start, t_end in think_regions) + + if not in_think: + return pos + + search_start = pos + 1 + + def _extract_content_before_tools(self, current_text: str, delta_text: str, + tool_start: int) -> Optional[str]: + """ + Extract content that appears before tool calls. + + Args: + current_text: Current text + delta_text: Delta text + tool_start: Start position of tools + + Returns: + Content before tools or None + """ + if tool_start > 0: + delta_start_pos = len(current_text) - len(delta_text) + if delta_start_pos < tool_start: + content_part = delta_text + if delta_start_pos + len(delta_text) > tool_start: + content_part = delta_text[:tool_start - delta_start_pos] + return content_part if content_part else None + return None + + def _extract_tool_content(self, current_text: str, tool_start: int) -> str: + """ + Extract tool content from current text starting at tool_start. 
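The helper above that locates the first tool-call start token outside any reasoning block is what keeps tool calls emitted inside the model's chain of thought from being surfaced. A standalone sketch, assuming "<think>...</think>" reasoning tags and a "<tool_calls>" start token (both are model-specific and only illustrative here):

import re
from typing import Optional

def find_tool_start_outside_thinking(
        text: str,
        start_token: str = "<tool_calls>",
        think_pattern: str = r"<think>(.*?)</think>") -> Optional[int]:
    # Positions of the start token are skipped when they fall inside a
    # completed thinking region; the first position outside is returned.
    think_regions = [(m.start(), m.end())
                     for m in re.finditer(think_pattern, text, flags=re.DOTALL)]
    search_start = 0
    while True:
        pos = text.find(start_token, search_start)
        if pos == -1:
            return None
        if not any(t_start <= pos < t_end for t_start, t_end in think_regions):
            return pos
        search_start = pos + 1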
+ + Args: + current_text: Current text + tool_start: Start position of tool calls + + Returns: + Extracted tool content + """ + tool_content_start = tool_start + len(self.tool_call_start_token) + tool_content = current_text[tool_content_start:] + + end_pos = tool_content.find(self.tool_call_end_token) + if end_pos != -1: + tool_content = tool_content[:end_pos] + + return tool_content From 609b533cb6f25f599fda94598bba446396498632 Mon Sep 17 00:00:00 2001 From: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com> Date: Thu, 7 Aug 2025 11:31:03 +0800 Subject: [PATCH 045/932] [Bugfix] Add proper comparison for package versions (#22314) Signed-off-by: Syed Muhammad Bin Asif --- benchmarks/kernels/benchmark_bitblas.py | 4 +++- docs/design/arch_overview.md | 3 ++- vllm/attention/ops/triton_decode_attention.py | 4 +++- vllm/model_executor/layers/quantization/bitblas.py | 4 +++- vllm/model_executor/layers/quantization/bitsandbytes.py | 7 +++++-- vllm/model_executor/layers/quantization/deepspeedfp.py | 3 ++- vllm/model_executor/layers/quantization/gptq_bitblas.py | 4 +++- vllm/model_executor/layers/quantization/ipex_quant.py | 7 +++++-- .../layers/quantization/kernels/mixed_precision/bitblas.py | 4 +++- .../layers/quantization/utils/bitblas_utils.py | 4 +++- .../model_executor/layers/quantization/utils/w8a8_utils.py | 5 +++-- vllm/model_executor/model_loader/bitsandbytes_loader.py | 4 +++- vllm/v1/sample/ops/topk_topp_sampler.py | 3 ++- 13 files changed, 40 insertions(+), 16 deletions(-) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 97ee060341..66b44c27d6 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -3,6 +3,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from packaging import version + from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( MINIMUM_BITBLAS_VERSION, ) @@ -10,7 +12,7 @@ from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}" diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md index 334df5dc9b..6b70867760 100644 --- a/docs/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -200,7 +200,8 @@ vision-language model. lora_config = vllm_config.lora_config super().__init__(config, cache_config, quant_config, lora_config, prefix) - if __version__ >= "0.6.4": + from packaging import version + if version.parse(__version__) >= version.parse("0.6.4"): MyModel = MyNewModel else: MyModel = MyOldModel diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index c27b377aeb..f82ce5b4d4 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -31,6 +31,8 @@ It supports page size >= 1. import logging +from packaging import version + from vllm.platforms import current_platform from vllm.triton_utils import tl, triton @@ -40,7 +42,7 @@ logger = logging.getLogger(__name__) # Only print the following warnings when triton version < 3.2.0. # The issue won't affect performance or accuracy. 
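The motivation for the version-check changes in this patch is that comparing version strings with '<' is lexicographic, which breaks as soon as a component reaches two digits; packaging.version.parse compares release segments numerically. A small illustration (MINIMUM_VERSION here is an example threshold, not tied to any particular package):

from packaging import version

# Lexicographic string comparison gives the wrong ordering once a version
# component has two digits:
assert "0.2.10" < "0.2.3"                                   # wrong ordering
assert version.parse("0.2.10") > version.parse("0.2.3")     # correct ordering

MINIMUM_VERSION = "0.46.1"

def require_min_version(installed: str,
                        minimum: str = MINIMUM_VERSION) -> None:
    # Mirror of the guard used throughout the patch: parse both sides
    # before comparing, and fail with an actionable message.
    if version.parse(installed) < version.parse(minimum):
        raise ImportError(f"installed version {installed} is too old; "
                          f"please install >= {minimum}")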
-if triton.__version__ < '3.2.0': +if version.parse(triton.__version__) < version.parse('3.2.0'): logger.warning( "The following error message 'operation scheduled before its operands' " "can be ignored.") diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index aa8eee88a9..39bd34d351 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -3,6 +3,7 @@ from typing import Any, Optional import torch +from packaging import version from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase @@ -45,7 +46,8 @@ class BitBLASConfig(QuantizationConfig): ) -> None: try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 5359189caa..0204ff4685 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -4,6 +4,7 @@ from typing import Any, Callable, Optional, Union import torch +from packaging import version from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) @@ -169,7 +170,8 @@ class BitsAndBytesLinearMethod(LinearMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: @@ -412,7 +414,8 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 8030be5259..2922aef329 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -6,6 +6,7 @@ from typing import Any, Optional import torch import torch.nn as nn import torch.nn.functional as F +from packaging import version from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods @@ -145,7 +146,7 @@ class DeepSpeedFPParameter(nn.Parameter): quant_config: DeepSpeedFPConfig): try: import deepspeed - if deepspeed.__version__ < "0.14.2": + if version.parse(deepspeed.__version__) < version.parse("0.14.2"): raise ImportError("deepspeed version is wrong. 
Please " "install deepspeed>=0.14.2.") from deepspeed.ops.fp_quantizer import FP_Quantize diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index caeb266d0b..d03074f861 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -3,6 +3,7 @@ from typing import Any, Optional import torch +from packaging import version from torch.nn.parameter import Parameter from vllm.logger import init_logger @@ -63,7 +64,8 @@ class GPTQBitBLASConfig(QuantizationConfig): try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 428e9b882b..9c458954f9 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -4,6 +4,7 @@ from typing import Any, Optional import torch +from packaging import version from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -135,7 +136,8 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod): try: import intel_extension_for_pytorch as ipex - if ipex.__version__ < MIN_IPEX_VERSION: + if version.parse( + ipex.__version__) < version.parse(MIN_IPEX_VERSION): raise ImportError( "intel_extension_for_pytorch version is " "wrong. Please install " @@ -199,7 +201,8 @@ class IPEXAWQLinearMethod(AWQLinearMethod): try: import intel_extension_for_pytorch as ipex - if ipex.__version__ < MIN_IPEX_VERSION: + if version.parse( + ipex.__version__) < version.parse(MIN_IPEX_VERSION): raise ImportError( "intel_extension_for_pytorch version is " "wrong. Please install " diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py index 649d07b4d0..0eca3b4c02 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py @@ -4,6 +4,7 @@ from typing import Optional import torch +from packaging import version from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -110,7 +111,8 @@ class BitBLASLinearKernel(MPLinearKernel): try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. 
Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py index 82ee3edfd5..4c2e548735 100644 --- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py +++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py @@ -3,6 +3,7 @@ from typing import Optional import torch +from packaging import version from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -75,7 +76,8 @@ def _check_bitblas_supported( # Finally, check if bitblas is installed try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse( + bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError("bitblas version is wrong. Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}") except ImportError: diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 47bb457932..ddb5096890 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -4,6 +4,7 @@ from typing import Callable, Optional, Union import torch +from packaging import version from vllm import _custom_ops as ops from vllm import envs @@ -21,8 +22,8 @@ TORCH_DEVICE_IDENTITY = None # torch._scaled_mm rowwise feature. # The condition is determined once as the operations # are time consuming. -USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() - and torch.__version__[0:3] >= "2.7" +USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse( + torch.__version__) >= version.parse("2.7") and current_platform.has_device_capability(94)) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index f54dfab523..ea2fb2e3ac 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -12,6 +12,7 @@ from typing import Any, Callable, Optional import numpy as np import torch from huggingface_hub import HfApi +from packaging import version from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME @@ -193,7 +194,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): try: import bitsandbytes - if bitsandbytes.__version__ < "0.46.1": + if version.parse( + bitsandbytes.__version__) < version.parse("0.46.1"): raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.46.1.") except ImportError as err: diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 460e1c0b05..e0434c8f3d 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -5,6 +5,7 @@ from typing import Optional import torch import torch.nn as nn +from packaging import version from vllm import envs from vllm.logger import init_logger @@ -32,7 +33,7 @@ class TopKTopPSampler(nn.Module): if current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ - if flashinfer_version < "0.2.3": + if version.parse(flashinfer_version) < version.parse("0.2.3"): logger.warning_once( "FlashInfer version >= 0.2.3 required. 
" "Falling back to default sampling implementation.") From ecbea55ca254186ed6cbf62702242d73c177a75f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 04:31:41 +0100 Subject: [PATCH 046/932] Update `hf_xet` pin to resolve hangs (#22356) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/common.txt | 1 - requirements/nightly_torch_test.txt | 1 - requirements/test.in | 1 - requirements/test.txt | 3 +-- 4 files changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 5405df359a..5c422500e1 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -8,7 +8,6 @@ tqdm blake3 py-cpuinfo transformers >= 4.55.0 -huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 7ae5e6f2f4..491fa06259 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -31,7 +31,6 @@ lm-eval[api]==0.4.8 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test transformers==4.52.4 tokenizers==0.21.1 -huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. # quantization bitsandbytes>=0.46.1 diff --git a/requirements/test.in b/requirements/test.in index 9c8c75dd6f..1e0cab80a2 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -37,7 +37,6 @@ lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test transformers==4.55.0 tokenizers==0.21.1 -huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. 
# quantization bitsandbytes==0.46.1 diff --git a/requirements/test.txt b/requirements/test.txt index 08ba964f22..324f8153b2 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -276,7 +276,7 @@ h5py==3.13.0 # via terratorch harfile==0.3.0 # via schemathesis -hf-xet==1.1.3 +hf-xet==1.1.7 # via huggingface-hub hiredis==3.0.0 # via tensorizer @@ -288,7 +288,6 @@ httpx==0.27.2 # schemathesis huggingface-hub==0.34.3 # via - # -r requirements/test.in # accelerate # datasets # evaluate From 14bcf93a6a59072fd5bc542d0ad73c54546cef5c Mon Sep 17 00:00:00 2001 From: "ZiTian.Zhao" Date: Thu, 7 Aug 2025 11:32:19 +0800 Subject: [PATCH 047/932] Optimize logger init performance by using module-level constants (#22373) Signed-off-by: zitian.zhao --- vllm/logger.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 69aaf4390a..8f06eb03c7 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -102,6 +102,14 @@ class _VllmLogger(Logger): _print_warning_once(self, msg, *args) +# Pre-defined methods mapping to avoid repeated dictionary creation +_METHODS_TO_PATCH = { + "debug_once": _print_debug_once, + "info_once": _print_info_once, + "warning_once": _print_warning_once, +} + + def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() @@ -144,13 +152,7 @@ def init_logger(name: str) -> _VllmLogger: logger = logging.getLogger(name) - methods_to_patch = { - "debug_once": _print_debug_once, - "info_once": _print_info_once, - "warning_once": _print_warning_once, - } - - for method_name, method in methods_to_patch.items(): + for method_name, method in _METHODS_TO_PATCH.items(): setattr(logger, method_name, MethodType(method, logger)) return cast(_VllmLogger, logger) From ad6c655dde487c256292ad85a538cdf5133ee28b Mon Sep 17 00:00:00 2001 From: Lionel Villard Date: Wed, 6 Aug 2025 23:33:24 -0400 Subject: [PATCH 048/932] preload heavy modules when mp method is forkserver (#22214) Signed-off-by: Lionel Villard --- vllm/benchmarks/latency.py | 4 +++- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index cebdf56c45..05378ec74d 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -13,7 +13,6 @@ import numpy as np from tqdm import tqdm import vllm.envs as envs -from vllm import LLM, SamplingParams from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import EngineArgs @@ -85,6 +84,9 @@ def main(args: argparse.Namespace): "Please set it to a valid path to use torch profiler.") engine_args = EngineArgs.from_cli_args(args) + # Lazy import to avoid importing LLM when the bench command is not selected. + from vllm import LLM, SamplingParams + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
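The logger change above moves the method-patching table to module scope so init_logger no longer rebuilds a dict on every call. A minimal sketch of the pattern with a single patched method; functools.cache stands in for vLLM's real deduplication helpers, which also handle lazy %-formatting and stack levels:

import logging
from functools import cache
from types import MethodType

@cache
def _log_info_once(logger: logging.Logger, msg: str) -> None:
    # cache() suppresses repeats of the same (logger, msg) pair; this is a
    # stand-in for the real *_once helpers, not their implementation.
    logger.info(msg)

# Built once at import time, instead of inside every init_logger() call.
_METHODS_TO_PATCH = {"info_once": _log_info_once}

def init_logger(name: str) -> logging.Logger:
    logger = logging.getLogger(name)
    for method_name, method in _METHODS_TO_PATCH.items():
        setattr(logger, method_name, MethodType(method, logger))
    return logger

log = init_logger(__name__)
log.info_once("engine initialized")   # logged
log.info_once("engine initialized")   # suppressed by the cache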
llm = LLM(**dataclasses.asdict(engine_args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 88ef16b87e..f6f83223a1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -8,6 +8,7 @@ import importlib import inspect import json import multiprocessing +import multiprocessing.forkserver as forkserver import os import signal import socket @@ -155,6 +156,15 @@ async def build_async_engine_client( client_config: Optional[dict[str, Any]] = None, ) -> AsyncIterator[EngineClient]: + if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver": + # The executor is expected to be mp. + # Pre-import heavy modules in the forkserver process + logger.debug("Setup forkserver with pre-imports") + multiprocessing.set_start_method('forkserver') + multiprocessing.set_forkserver_preload(["vllm.v1.engine.async_llm"]) + forkserver.ensure_running() + logger.debug("Forkserver setup complete!") + # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) From f6278b6243079784dc71e63244f6de38a47bf6c2 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 6 Aug 2025 20:56:02 -0700 Subject: [PATCH 049/932] [gpt-oss] Convert user input to harmony format (#22402) Signed-off-by: Chen Zhang Co-authored-by: Woosuk Kwon --- vllm/entrypoints/chat_utils.py | 4 +- vllm/entrypoints/harmony_utils.py | 60 ++++++- vllm/entrypoints/openai/protocol.py | 9 +- vllm/entrypoints/openai/serving_responses.py | 158 +++++++++++++++++-- 4 files changed, 216 insertions(+), 15 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a658d97cc8..74c8093f49 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -29,6 +29,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam, from openai.types.chat.chat_completion_content_part_input_audio_param import ( InputAudio) from openai.types.responses import ResponseInputImageParam +from openai_harmony import Message as OpenAIHarmonyMessage from PIL import Image from pydantic import BaseModel, ConfigDict, TypeAdapter # yapf: enable @@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, - CustomChatCompletionMessageParam] + CustomChatCompletionMessageParam, + OpenAIHarmonyMessage] # TODO: Make fields ReadOnly once mypy supports it diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index ecda35c980..ee08d62b57 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -2,14 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime from collections.abc import Iterable, Sequence -from typing import Literal, Optional +from typing import Literal, Optional, Union +from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem from openai.types.responses.tool import Tool -from openai_harmony import (Conversation, DeveloperContent, +from openai_harmony import (Author, Conversation, DeveloperContent, HarmonyEncodingName, Message, ReasoningEffort, Role, StreamableParser, SystemContent, TextContent, ToolDescription, load_harmony_encoding) +from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem, + ResponseReasoningItem) + REASONING_EFFORT = { "high": ReasoningEffort.HIGH, "medium": ReasoningEffort.MEDIUM, @@ -85,6 
+89,58 @@ def get_user_message(content: str) -> Message: return Message.from_role_and_content(Role.USER, content) +def parse_response_input( + response_msg: ResponseInputOutputItem, + prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]] +) -> Message: + if not isinstance(response_msg, dict): + response_msg = response_msg.model_dump() + if "type" not in response_msg or response_msg["type"] == "message": + role = response_msg["role"] + content = response_msg["content"] + if role == "system": + # User is trying to set a system message. Change it to: + # <|start|>developer<|message|># Instructions + # {instructions}<|end|> + role = "developer" + text_prefix = "Instructions:\n" + else: + text_prefix = "" + if isinstance(content, str): + msg = Message.from_role_and_content(role, text_prefix + content) + else: + contents = [ + TextContent(text=text_prefix + c["text"]) for c in content + ] + msg = Message.from_role_and_contents(role, contents) + elif response_msg["type"] == "function_call_output": + call_id = response_msg["call_id"] + call_response: Optional[ResponseFunctionToolCall] = None + for prev_response in reversed(prev_responses): + if isinstance(prev_response, ResponseFunctionToolCall + ) and prev_response.call_id == call_id: + call_response = prev_response + break + if call_response is None: + raise ValueError(f"No call message found for {call_id}") + msg = Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{call_response.name}"), + response_msg["output"]) + elif response_msg["type"] == "reasoning": + content = response_msg["content"] + assert len(content) == 1 + msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"]) + elif response_msg["type"] == "function_call": + msg = Message.from_role_and_content(Role.ASSISTANT, + response_msg["arguments"]) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{response_msg['name']}") + msg = msg.with_content_type("json") + else: + raise ValueError(f"Unknown input type: {response_msg['type']}") + return msg + + def parse_chat_input(chat_msg) -> Message: role = chat_msg["role"] content = chat_msg["content"] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 57aa427207..421927d61b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -17,7 +17,8 @@ from openai.types.chat.chat_completion_audio import ( from openai.types.chat.chat_completion_message import ( Annotation as OpenAIAnnotation) # yapf: enable -from openai.types.responses import (ResponseInputParam, ResponseOutputItem, +from openai.types.responses import (ResponseFunctionToolCall, + ResponseInputItemParam, ResponseOutputItem, ResponseOutputMessage, ResponsePrompt, ResponseStatus, ResponseTextConfig) from openai.types.responses.response import ToolChoice @@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors], return None +ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, + ResponseFunctionToolCall] + + class ResponsesRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/responses/create @@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel): "reasoning.encrypted_content", ], ]] = None - input: Union[str, ResponseInputParam] + input: Union[str, list[ResponseInputOutputItem]] instructions: Optional[str] = None max_output_tokens: Optional[int] = None max_tool_calls: Optional[int] = None diff --git 
a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4ca863fd07..3c0b590b0c 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,12 +4,15 @@ import asyncio import time from collections.abc import AsyncGenerator, AsyncIterator +from copy import copy from http import HTTPStatus from typing import Callable, Final, Optional, Union import jinja2 from fastapi import Request -from openai.types.responses import ResponseOutputMessage, ResponseOutputText +from openai.types.responses import (ResponseFunctionToolCall, + ResponseOutputMessage, ResponseOutputText) +from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs from vllm.config import ModelConfig @@ -17,6 +20,10 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) from vllm.entrypoints.context import ConversationContext, SimpleContext +from vllm.entrypoints.harmony_utils import ( + get_developer_message, get_stop_tokens_for_assistant_actions, + get_system_message, get_user_message, parse_response_input, + render_for_completion) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -30,6 +37,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.tool_server import ToolServer +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams @@ -103,6 +111,29 @@ class OpenAIServingResponses(OpenAIServing): "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may " "cause a memory leak since we never remove responses from " "the store.") + + self.use_harmony = model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice " + "and always enable tool use.") + # OpenAI models have two EOS-like tokens: <|return|> and <|call|>. + # We need to add them to the stop token ids. + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + get_stop_tokens_for_assistant_actions()) + + # set up tool use + self.enable_auto_tools: bool = enable_auto_tools + if self.enable_auto_tools: + logger.info( + "\"auto\" tool choice has been enabled please note that while" + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored.") + if not self.use_harmony: + raise NotImplementedError("Auto tool choice is not supported " + "yet unless using Harmony") + # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove responses from the store. @@ -165,21 +196,20 @@ class OpenAIServingResponses(OpenAIServing): return self._make_not_found_error(prev_response_id) else: prev_response = None - # Construct the input messages. 
- messages = self._construct_input_messages(request, prev_response) try: lora_request = self._maybe_get_adapters(request) model_name = self._get_model_name(request.model, lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) - _, request_prompts, engine_prompts = await self._preprocess_chat( - request, - tokenizer, - messages, - chat_template=self.chat_template, - chat_template_content_format=self.chat_template_content_format, - ) + if self.use_harmony: + messages, request_prompts, engine_prompts = ( + self._make_request_with_harmony(request, prev_response)) + else: + messages, request_prompts, engine_prompts = ( + await self._make_request(request, prev_response, + tokenizer)) + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -275,6 +305,38 @@ class OpenAIServingResponses(OpenAIServing): except Exception as e: return self.create_error_response(str(e)) + async def _make_request( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + tokenizer: AnyTokenizer, + ): + # Construct the input messages. + messages = self._construct_input_messages(request, prev_response) + _, request_prompts, engine_prompts = await self._preprocess_chat( + request, + tokenizer, + messages, + chat_template=self.chat_template, + chat_template_content_format=self.chat_template_content_format, + ) + return messages, request_prompts, engine_prompts + + def _make_request_with_harmony( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + ): + if request.tool_choice != "auto": + raise NotImplementedError( + "Only 'auto' tool_choice is supported in " + "response API with Harmony") + messages = self._construct_input_messages_with_harmony( + request, prev_response) + prompt_token_ids = render_for_completion(messages) + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + return messages, [prompt_token_ids], [engine_prompt] + async def responses_full_generator( self, request: ResponsesRequest, @@ -411,6 +473,82 @@ class OpenAIServingResponses(OpenAIServing): messages.extend(request.input) # type: ignore return messages + def _construct_input_messages_with_harmony( + self, + request: ResponsesRequest, + prev_response: Optional[ResponsesResponse], + ) -> list[OpenAIHarmonyMessage]: + messages: list[OpenAIHarmonyMessage] = [] + if prev_response is None: + # New conversation. + reasoning_effort = (request.reasoning.effort + if request.reasoning else None) + tool_types = [tool.type for tool in request.tools] + enable_browser = ("web_search_preview" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("browser")) + enable_code_interpreter = ("code_interpreter" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("python")) + sys_msg = get_system_message( + reasoning_effort=reasoning_effort, + browser_description=self.tool_server.get_tool_description( + "browser") + if enable_browser and self.tool_server is not None else None, + python_description=self.tool_server.get_tool_description( + "python") if enable_code_interpreter + and self.tool_server is not None else None, + ) + messages.append(sys_msg) + dev_msg = get_developer_message(request.instructions, + request.tools) + messages.append(dev_msg) + else: + # Continue the previous conversation. + # FIXME(woosuk): Currently, request params like reasoning and + # instructions are ignored. 
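For a new conversation, the harmony path above builds the prompt from a system message (reasoning effort plus optional tool descriptions), a developer message carrying the instructions and function tools, and the user input, then renders the whole conversation to token IDs. A sketch of that first-turn path, assuming the helper signatures exactly as they are used in this patch and no browser or code-interpreter tools:

from vllm.entrypoints.harmony_utils import (get_developer_message,
                                            get_system_message,
                                            get_user_message,
                                            render_for_completion)
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt

def build_first_turn_prompt(instructions, tools, user_text):
    # First turn only: system + developer prefix, then the user message.
    messages = [
        get_system_message(reasoning_effort=None,
                           browser_description=None,
                           python_description=None),
        get_developer_message(instructions, tools),
        get_user_message(user_text),
    ]
    prompt_token_ids = render_for_completion(messages)
    return EngineTokensPrompt(prompt_token_ids=prompt_token_ids)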
+ prev_msgs = self.msg_store[prev_response.id] + # Remove the previous chain-of-thoughts if there is a new "final" + # message. Note that this also removes these messages from the + # msg_store. + if len(prev_msgs) > 0: + last_msg = prev_msgs[-1] + assert isinstance(last_msg, OpenAIHarmonyMessage) + if last_msg.channel == "final": + prev_final_msg_idx = -1 + for i in range(len(prev_msgs) - 2, -1, -1): + prev_msg_i = prev_msgs[i] + assert isinstance(prev_msg_i, OpenAIHarmonyMessage) + if prev_msg_i.channel == "final": + prev_final_msg_idx = i + break + recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:] + del prev_msgs[prev_final_msg_idx + 1:] + for msg in recent_turn_msgs: + assert isinstance(msg, OpenAIHarmonyMessage) + if msg.channel != "analysis": + prev_msgs.append(msg) + messages.extend(prev_msgs) + # Append the new input. + # Reponses API supports simple text inputs without chat format. + if isinstance(request.input, str): + messages.append(get_user_message(request.input)) + else: + if prev_response is not None: + prev_outputs = copy(prev_response.output) + else: + prev_outputs = [] + for response_msg in request.input: + messages.append( + parse_response_input(response_msg, prev_outputs)) + # User passes in a a tool call request and its output. We need + # to add the tool call request to prev_outputs so that the + # parse_response_input can find the tool call request when + # parsing the tool call output. + if isinstance(response_msg, ResponseFunctionToolCall): + prev_outputs.append(response_msg) + return messages + async def _run_background_request( self, request: ResponsesRequest, From 4be02a37767f05a3fd27d66435d5cebea7a9bfe8 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 7 Aug 2025 12:07:54 +0800 Subject: [PATCH 050/932] [Bugfix] EPLB load statistics problem (#22167) Signed-off-by: ycyaw66 <497410282@qq.com> Signed-off-by: David Chen <530634352@qq.com> Co-authored-by: ycyaw66 <497410282@qq.com> --- vllm/distributed/eplb/eplb_state.py | 50 +++++++++---------- vllm/model_executor/layers/fused_moe/layer.py | 17 +------ 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index f64b516b0d..c415d409f7 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -32,7 +32,7 @@ from dataclasses import dataclass from typing import Optional, Union import torch -from torch.distributed import ProcessGroup, all_gather, all_reduce +from torch.distributed import ProcessGroup, all_reduce from vllm.config import ParallelConfig from vllm.distributed.parallel_state import (get_ep_group, get_node_count, @@ -112,13 +112,21 @@ class EplbState: Expert load during this forward pass. We use the token count each expert processes as the load. - Shape: (num_moe_layers, num_local_physical_experts) + Shape: (num_moe_layers, num_physical_experts) """ expert_load_window: torch.Tensor """ A sliding window of expert load. - Shape: (window_size, num_moe_layers, num_local_physical_experts) + Shape: (window_size, num_moe_layers, num_physical_experts) + + NOTE: The expert_load_view now records load for all physical experts + rather than just local experts. This ensures consistent load statistics + across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels). + The recorded load will be multiplied by dp_size when using naive all-to-all + due to each DP rank contributing the same token set to the calculation. 
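With the load recorded per physical expert as described above, per-rank statistics for logging are recovered by folding the expert dimension back into (num_ranks, experts_per_rank). A small sketch of that reduction, assuming physical experts are laid out contiguously by EP rank:

import torch

def per_rank_token_counts(expert_load_pass: torch.Tensor,
                          num_ranks: int) -> torch.Tensor:
    # expert_load_pass: (num_moe_layers, num_physical_experts), already
    # summed across ranks (e.g. via all_reduce). Returns a float tensor of
    # shape (num_moe_layers, num_ranks) with the token count per EP rank.
    num_layers, num_physical_experts = expert_load_pass.shape
    assert num_physical_experts % num_ranks == 0
    return (expert_load_pass
            .reshape(num_layers, num_ranks, -1)
            .sum(dim=-1)
            .float())

load = torch.tensor([[3, 1, 0, 4], [2, 2, 2, 2]])   # 2 layers, 4 experts
print(per_rank_token_counts(load, num_ranks=2))     # [[4., 4.], [4., 4.]]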
+ See: + https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856 """ expert_load_window_step: int = 0 """ @@ -232,14 +240,14 @@ class EplbState: ).contiguous() expert_load_pass = torch.zeros( - (model.num_moe_layers, model.num_local_physical_experts), + (model.num_moe_layers, model.num_physical_experts), dtype=torch.int32, device=device, ) expert_load_window_size = parallel_config.eplb_window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, - model.num_local_physical_experts), + model.num_physical_experts), dtype=torch.int32, device=device, ) @@ -353,18 +361,18 @@ class EplbState: self.expert_load_pass.zero_() if log_stats: - # `num_tokens`: (num_moe_layers,) - num_tokens = self.expert_load_pass.sum(dim=-1) + # total_expert_load_pass: (num_moe_layers, num_physical_experts) + total_expert_load_pass = self.expert_load_pass.clone() # Collect load metrics from all ranks ep_group = get_ep_group().device_group assert ep_group is not None - num_tokens_list = [ - torch.empty_like(num_tokens) for _ in range(ep_group.size()) - ] - all_gather(num_tokens_list, num_tokens, group=ep_group) - # Stack to get (num_ranks, num_moe_layers) - num_tokens_per_rank = torch.stack(num_tokens_list).float() + all_reduce(total_expert_load_pass, group=ep_group) + + # num_tokens_per_rank: (num_moe_layers, num_ranks) + num_tokens_per_rank = total_expert_load_pass.reshape( + total_expert_load_pass.shape[0], ep_group.size(), + -1).sum(dim=-1).float() # Compute balancedness ratio: # for each layer: @@ -426,17 +434,7 @@ class EplbState: "(profile)" if is_profile else "") if global_expert_load is None: - # This mapping is only used here, so we do not store it in the state - physical_expert_start = ep_rank * model.num_local_physical_experts - physical_expert_end = (physical_expert_start + - model.num_local_physical_experts) - # (num_moe_layers, num_local_physical_experts) - local_physical_to_logical_map = self.physical_to_logical_map[ - :, - physical_expert_start:physical_expert_end, - ] - - # Map the local physical expert load to global logical experts + # Map the physical expert load to global logical experts logical_expert_load_window = torch.zeros( self.expert_load_window_size, model.num_moe_layers, @@ -446,7 +444,7 @@ class EplbState: ) logical_expert_load_window.scatter_add_( dim=-1, - index=local_physical_to_logical_map.unsqueeze(0).expand_as( + index=self.physical_to_logical_map.unsqueeze(0).expand_as( self.expert_load_window).long(), src=self.expert_load_window, ) @@ -618,4 +616,4 @@ def _node_count_with_rank_mapping( if is_same_node and node_assignment[other_rank] == 0: node_assignment[other_rank] = next_node_id - return next_node_id + return next_node_id \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a4a6157fa4..72c2bc9a3d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1430,22 +1430,9 @@ class FusedMoE(torch.nn.Module): # to the modular kernel, we can move this logic there # to achieve better efficiency. 
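Rearrangement operates on logical experts, so the recorded per-physical-expert window has to be folded through the replica mapping first; the scatter_add_ above does exactly that. A standalone sketch of the mapping step for a single snapshot (tensor names are illustrative):

import torch

def physical_to_logical_load(load: torch.Tensor,
                             physical_to_logical_map: torch.Tensor,
                             num_logical_experts: int) -> torch.Tensor:
    # load:                    (num_moe_layers, num_physical_experts)
    # physical_to_logical_map: (num_moe_layers, num_physical_experts),
    #                          values in [0, num_logical_experts)
    logical = torch.zeros(load.shape[0], num_logical_experts,
                          dtype=load.dtype, device=load.device)
    # Replicas of the same logical expert accumulate into one slot.
    logical.scatter_add_(dim=-1,
                         index=physical_to_logical_map.long(),
                         src=load)
    return logical

load = torch.tensor([[5, 3, 2]])      # one layer, three physical experts
mapping = torch.tensor([[0, 0, 1]])   # experts 0 and 1 replicate logical expert 0
print(physical_to_logical_load(load, mapping, num_logical_experts=2))  # [[8, 2]]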
- # `expert_load_view`: (num_logical_experts,) + # `expert_load_view`: (num_physical_experts,) - # Mask out non-local experts - if expert_map is not None: - topk_ids_local = expert_map[topk_ids] - topk_ids_flatten = topk_ids_local.flatten() - else: - topk_ids_flatten = topk_ids.flatten() - - # Should be equivalent to: - # ``` - # topk_ids_masked = topk_ids_local[topk_ids_local >= 0] - # expert_load_view += topk_ids_masked.bincount( - # minlength=expert_load_view.shape[0]) - # ``` - # We use `scatter_add_` since `bincount` cannot be compiled + topk_ids_flatten = topk_ids.flatten() # Performance optimization: # `masked_fill` is significantly faster than `masked_select` From 2a4c825523d5715068bf3ec373f662e113c66f45 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 7 Aug 2025 14:05:03 +0800 Subject: [PATCH 051/932] [CI] Skip the pooling models that do not support transformers v4.55 (#22411) Signed-off-by: wang.yuqi --- tests/models/language/pooling/test_embedding.py | 5 ++++- tests/models/language/pooling/test_gte.py | 9 +++++++++ tests/models/language/pooling/test_reward.py | 4 ++++ tests/models/utils.py | 11 +++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 51283dc630..2dd35c4151 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -7,7 +7,7 @@ import pytest from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ...utils import check_embeddings_close +from ...utils import check_embeddings_close, check_transformers_version @pytest.fixture(autouse=True) @@ -56,6 +56,9 @@ def test_models( model, monkeypatch, ) -> None: + if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model, max_transformers_version="4.53.2") + if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention # switch to use ROCm CK FA backend diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 6d2eff7099..48a0cd64fe 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,6 +4,7 @@ from typing import Any import pytest +from ...utils import check_transformers_version from .embed_utils import EmbedModelInfo, correctness_test_embed_models from .mteb_utils import mteb_test_embed_models @@ -60,6 +61,10 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: + if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model_info.name, + max_transformers_version="4.53.2") + vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} @@ -72,6 +77,10 @@ def test_embed_models_mteb(hf_runner, vllm_runner, def test_embed_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts) -> None: + if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": + check_transformers_version(model_info.name, + max_transformers_version="4.53.2") + vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} diff --git a/tests/models/language/pooling/test_reward.py 
b/tests/models/language/pooling/test_reward.py index 7add1d975c..beafa0aed9 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -10,6 +10,7 @@ from transformers import AutoModel from vllm.platforms import current_platform from ....conftest import HfRunner +from ...utils import check_transformers_version @pytest.fixture(autouse=True) @@ -86,6 +87,9 @@ def test_prm_models( dtype: str, monkeypatch, ) -> None: + check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B", + max_transformers_version="4.53.2") + if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0": pytest.skip("CPU only supports V1") diff --git a/tests/models/utils.py b/tests/models/utils.py index 1513db5220..4657df60b1 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -412,3 +412,14 @@ def dummy_hf_overrides( }) return hf_config + + +def check_transformers_version(model: str, + min_transformers_version: Optional[str] = None, + max_transformers_version: Optional[str] = None): + from .registry import _HfExamplesInfo + + return _HfExamplesInfo(model, + min_transformers_version=min_transformers_version, + max_transformers_version=max_transformers_version + ).check_transformers_version(on_fail="skip") From 4d4297e8fe96d64be0a114636512fbbe1e5ee0d6 Mon Sep 17 00:00:00 2001 From: lkchen Date: Wed, 6 Aug 2025 23:05:07 -0700 Subject: [PATCH 052/932] [Bench] Split serve.py:main into async/async versions (#22405) Signed-off-by: Linkun --- vllm/benchmarks/serve.py | 102 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 49 deletions(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index ca8d218581..6d52b51a9f 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -948,7 +948,10 @@ def add_cli_args(parser: argparse.ArgumentParser): ) -def main(args: argparse.Namespace): +def main(args: argparse.Namespace) -> dict[str, Any]: + return asyncio.run(main_async(args)) + +async def main_async(args: argparse.Namespace) -> dict[str, Any]: print(args) random.seed(args.seed) np.random.seed(args.seed) @@ -1025,8 +1028,7 @@ def main(args: argparse.Namespace): gc.collect() gc.freeze() - benchmark_result = asyncio.run( - benchmark( + benchmark_result = await benchmark( endpoint_type=args.endpoint_type, api_url=api_url, base_url=base_url, @@ -1052,62 +1054,62 @@ def main(args: argparse.Namespace): ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, ready_check_timeout_sec=args.ready_check_timeout_sec, - )) + ) # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} + result_json: dict[str, Any] = {} - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["endpoint_type"] = args.endpoint_type - result_json["label"] = label - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["endpoint_type"] = args.endpoint_type + result_json["label"] = label + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata 
format. Please use KEY=VALUE format." - ) + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=") + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." + ) - # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency + # Traffic + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency - if args.ramp_up_strategy is not None: - result_json["ramp_up_strategy"] = args.ramp_up_strategy - result_json["ramp_up_start_rps"] = args.ramp_up_start_rps - result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file + if args.save_result or args.append_result: base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") @@ -1129,3 +1131,5 @@ def main(args: argparse.Namespace): outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) + + return result_json \ No newline at end of file From cbc8457b2663e66beb2dedb20f3f0728b82ae603 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 7 Aug 2025 14:05:24 +0800 Subject: [PATCH 053/932] [Model] Switch to Fused RMS norm in Qwen2.5_VL model. 
(#22184) Signed-off-by: kf Signed-off-by: tjtanaavllm Signed-off-by: vllmellm Co-authored-by: kf --- vllm/model_executor/models/qwen2_5_vl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 45fb7f9580..79c5c77f6d 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -396,13 +396,13 @@ class Qwen2_5_VisionBlock(nn.Module): max_seqlen: Optional[int] = None, # Only used for Flash Attention seqlens: Optional[list[int]] = None, # Only used for xFormers ) -> torch.Tensor: - x = x + self.attn(self.norm1(x), - cu_seqlens=cu_seqlens, - rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens) - - x = x + self.mlp(self.norm2(x)) + x_attn = self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens) + x_fused_norm, residual = self.norm2(x, residual=x_attn) + x = residual + self.mlp(x_fused_norm) return x From 370661856bcfc4cdc9a88580cb70d66b7ac9fc7c Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:06:00 +0200 Subject: [PATCH 054/932] [Frontend] Update OpenAI error response to upstream format (#22099) Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> --- .../entrypoints/openai/test_classification.py | 5 +- .../entrypoints/openai/test_lora_resolvers.py | 8 +-- .../entrypoints/openai/test_serving_models.py | 16 +++--- .../openai/test_transcription_validation.py | 17 ++++--- .../openai/test_translation_validation.py | 5 +- vllm/entrypoints/openai/api_server.py | 50 ++++++++++--------- vllm/entrypoints/openai/protocol.py | 7 ++- vllm/entrypoints/openai/run_batch.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 21 ++++---- vllm/entrypoints/openai/serving_models.py | 9 ++-- 10 files changed, 73 insertions(+), 67 deletions(-) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index bcf127307f..886267c211 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -121,8 +121,7 @@ def test_invalid_truncate_prompt_tokens_error(server: RemoteOpenAIServer, error = classification_response.json() assert classification_response.status_code == 400 - assert error["object"] == "error" - assert "truncate_prompt_tokens" in error["message"] + assert "truncate_prompt_tokens" in error["error"]["message"] @pytest.mark.parametrize("model_name", [MODEL_NAME]) @@ -137,7 +136,7 @@ def test_empty_input_error(server: RemoteOpenAIServer, model_name: str): error = classification_response.json() assert classification_response.status_code == 400 - assert error["object"] == "error" + assert "error" in error @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index d4afdf7751..f480117258 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -160,8 +160,8 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup, mock_engine.generate.assert_not_called() assert isinstance(response, ErrorResponse) - assert response.code == HTTPStatus.NOT_FOUND.value - assert non_existent_model in response.message + assert response.error.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in 
response.error.message @pytest.mark.asyncio @@ -190,8 +190,8 @@ async def test_serving_completion_resolver_add_lora_fails( # Assert the correct error response assert isinstance(response, ErrorResponse) - assert response.code == HTTPStatus.BAD_REQUEST.value - assert invalid_model in response.message + assert response.error.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.error.message @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index c3b458d717..bc6a0341f5 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -66,8 +66,8 @@ async def test_load_lora_adapter_missing_fields(): request = LoadLoRAAdapterRequest(lora_name="", lora_path="") response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST @pytest.mark.asyncio @@ -84,8 +84,8 @@ async def test_load_lora_adapter_duplicate(): lora_path="/path/to/adapter1") response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST assert len(serving_models.lora_requests) == 1 @@ -110,8 +110,8 @@ async def test_unload_lora_adapter_missing_fields(): request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None) response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "InvalidUserInput" - assert response.code == HTTPStatus.BAD_REQUEST + assert response.error.type == "InvalidUserInput" + assert response.error.code == HTTPStatus.BAD_REQUEST @pytest.mark.asyncio @@ -120,5 +120,5 @@ async def test_unload_lora_adapter_not_found(): request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter") response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) - assert response.type == "NotFoundError" - assert response.code == HTTPStatus.NOT_FOUND + assert response.error.type == "NotFoundError" + assert response.error.code == HTTPStatus.NOT_FOUND diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index a8e2eb40b1..28fd02171b 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -116,8 +116,10 @@ async def test_non_asr_model(winning_call): file=winning_call, language="en", temperature=0.0) - assert res.code == 400 and not res.text - assert res.message == "The model does not support Transcriptions API" + err = res.error + assert err["code"] == 400 and not res.text + assert err[ + "message"] == "The model does not support Transcriptions API" @pytest.mark.asyncio @@ -133,12 +135,15 @@ async def test_completion_endpoints(): "role": "system", "content": "You are a helpful assistant." 
}]) - assert res.code == 400 - assert res.message == "The model does not support Chat Completions API" + err = res.error + assert err["code"] == 400 + assert err[ + "message"] == "The model does not support Chat Completions API" res = await client.completions.create(model=model_name, prompt="Hello") - assert res.code == 400 - assert res.message == "The model does not support Completions API" + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 79e769e3a1..bfa9bdef1c 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -73,8 +73,9 @@ async def test_non_asr_model(foscolo): res = await client.audio.translations.create(model=model_name, file=foscolo, temperature=0.0) - assert res.code == 400 and not res.text - assert res.message == "The model does not support Translations API" + err = res.error + assert err["code"] == 400 and not res.text + assert err["message"] == "The model does not support Translations API" @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f6f83223a1..c695ea8b5a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -62,7 +62,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DetokenizeRequest, DetokenizeResponse, EmbeddingRequest, - EmbeddingResponse, ErrorResponse, + EmbeddingResponse, ErrorInfo, + ErrorResponse, LoadLoRAAdapterRequest, PoolingRequest, PoolingResponse, RerankRequest, RerankResponse, @@ -506,7 +507,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TokenizeResponse): return JSONResponse(content=generator.model_dump()) @@ -540,7 +541,7 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, DetokenizeResponse): return JSONResponse(content=generator.model_dump()) @@ -556,7 +557,7 @@ def maybe_register_tokenizer_info_endpoint(args): """Get comprehensive tokenizer information.""" result = await tokenization(raw_request).get_tokenizer_info() return JSONResponse(content=result.model_dump(), - status_code=result.code if isinstance( + status_code=result.error.code if isinstance( result, ErrorResponse) else 200) @@ -603,7 +604,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ResponsesResponse): return JSONResponse(content=generator.model_dump()) return StreamingResponse(content=generator, media_type="text/event-stream") @@ -620,7 +621,7 @@ async def retrieve_responses(response_id: str, raw_request: Request): if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return JSONResponse(content=response.model_dump()) @@ -635,7 
+636,7 @@ async def cancel_responses(response_id: str, raw_request: Request): if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return JSONResponse(content=response.model_dump()) @@ -670,7 +671,7 @@ async def create_chat_completion(request: ChatCompletionRequest, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ChatCompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -715,7 +716,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, CompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -744,7 +745,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, EmbeddingResponse): return JSONResponse(content=generator.model_dump()) @@ -772,7 +773,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): generator = await handler.create_pooling(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, PoolingResponse): return JSONResponse(content=generator.model_dump()) @@ -792,7 +793,7 @@ async def create_classify(request: ClassificationRequest, generator = await handler.create_classify(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ClassificationResponse): return JSONResponse(content=generator.model_dump()) @@ -821,7 +822,7 @@ async def create_score(request: ScoreRequest, raw_request: Request): generator = await handler.create_score(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, ScoreResponse): return JSONResponse(content=generator.model_dump()) @@ -881,7 +882,7 @@ async def create_transcriptions(raw_request: Request, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TranscriptionResponse): return JSONResponse(content=generator.model_dump()) @@ -922,7 +923,7 @@ async def create_translations(request: Annotated[TranslationRequest, if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) elif isinstance(generator, TranslationResponse): return JSONResponse(content=generator.model_dump()) @@ -950,7 +951,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): generator = await handler.do_rerank(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + status_code=generator.error.code) 
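For reference, a minimal sketch of the payload shape this change produces on the wire; the two models below merely mirror the ErrorInfo/ErrorResponse definitions added to protocol.py later in this patch, and the message text is one of the strings exercised by the tests above.

from typing import Optional
from pydantic import BaseModel

class ErrorInfo(BaseModel):
    message: str
    type: str
    param: Optional[str] = None
    code: int

class ErrorResponse(BaseModel):
    error: ErrorInfo

resp = ErrorResponse(error=ErrorInfo(
    message="The model does not support Completions API",
    type="BadRequestError",
    code=400,
))
print(resp.model_dump())
# {'error': {'message': '...', 'type': 'BadRequestError', 'param': None, 'code': 400}}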
elif isinstance(generator, RerankResponse): return JSONResponse(content=generator.model_dump()) @@ -1175,7 +1176,7 @@ async def invocations(raw_request: Request): msg = ("Cannot find suitable handler for request. " f"Expected one of: {type_names}") res = base(raw_request).create_error_response(message=msg) - return JSONResponse(content=res.model_dump(), status_code=res.code) + return JSONResponse(content=res.model_dump(), status_code=res.error.code) if envs.VLLM_TORCH_PROFILER_DIR: @@ -1211,7 +1212,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: response = await handler.load_lora_adapter(request) if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return Response(status_code=200, content=response) @@ -1223,7 +1224,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: response = await handler.unload_lora_adapter(request) if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), - status_code=response.code) + status_code=response.error.code) return Response(status_code=200, content=response) @@ -1502,9 +1503,10 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(HTTPException) async def http_exception_handler(_: Request, exc: HTTPException): - err = ErrorResponse(message=exc.detail, + err = ErrorResponse( + error=ErrorInfo(message=exc.detail, type=HTTPStatus(exc.status_code).phrase, - code=exc.status_code) + code=exc.status_code)) return JSONResponse(err.model_dump(), status_code=exc.status_code) @app.exception_handler(RequestValidationError) @@ -1518,9 +1520,9 @@ def build_app(args: Namespace) -> FastAPI: else: message = exc_str - err = ErrorResponse(message=message, - type=HTTPStatus.BAD_REQUEST.phrase, - code=HTTPStatus.BAD_REQUEST) + err = ErrorResponse(error=ErrorInfo(message=message, + type=HTTPStatus.BAD_REQUEST.phrase, + code=HTTPStatus.BAD_REQUEST)) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 421927d61b..ea2cf57563 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -78,14 +78,17 @@ class OpenAIBaseModel(BaseModel): return result -class ErrorResponse(OpenAIBaseModel): - object: str = "error" +class ErrorInfo(OpenAIBaseModel): message: str type: str param: Optional[str] = None code: int +class ErrorResponse(OpenAIBaseModel): + error: ErrorInfo + + class ModelPermission(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") object: str = "model_permission" diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index d146ad485d..a10d57456b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -302,7 +302,7 @@ async def run_request(serving_engine_func: Callable, id=f"vllm-{random_uuid()}", custom_id=request.custom_id, response=BatchResponseData( - status_code=response.code, + status_code=response.error.code, request_id=f"vllm-batch-{random_uuid()}"), error=response, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 822f186840..efd2f20299 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -47,10 +47,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, - EmbeddingResponse, 
ErrorResponse, - PoolingResponse, RerankRequest, - ResponsesRequest, ScoreRequest, - ScoreResponse, + EmbeddingResponse, ErrorInfo, + ErrorResponse, PoolingResponse, + RerankRequest, ResponsesRequest, + ScoreRequest, ScoreResponse, TokenizeChatRequest, TokenizeCompletionRequest, TokenizeResponse, @@ -412,21 +412,18 @@ class OpenAIServing: message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: - return ErrorResponse(message=message, - type=err_type, - code=status_code.value) + return ErrorResponse(error=ErrorInfo( + message=message, type=err_type, code=status_code.value)) def create_streaming_error_response( self, message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: - json_str = json.dumps({ - "error": + json_str = json.dumps( self.create_error_response(message=message, err_type=err_type, - status_code=status_code).model_dump() - }) + status_code=status_code).model_dump()) return json_str async def _check_model( @@ -445,7 +442,7 @@ class OpenAIServing: if isinstance(load_result, LoRARequest): return None if isinstance(load_result, ErrorResponse) and \ - load_result.code == HTTPStatus.BAD_REQUEST.value: + load_result.error.code == HTTPStatus.BAD_REQUEST.value: error_response = load_result return error_response or self.create_error_response( diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 27614fcb41..a4efa0815b 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -9,7 +9,7 @@ from typing import Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.openai.protocol import (ErrorResponse, +from vllm.entrypoints.openai.protocol import (ErrorInfo, ErrorResponse, LoadLoRAAdapterRequest, ModelCard, ModelList, ModelPermission, @@ -82,7 +82,7 @@ class OpenAIServingModels: load_result = await self.load_lora_adapter( request=load_request, base_model_name=lora.base_model_name) if isinstance(load_result, ErrorResponse): - raise ValueError(load_result.message) + raise ValueError(load_result.error.message) def is_base_model(self, model_name) -> bool: return any(model.name == model_name for model in self.base_model_paths) @@ -284,6 +284,5 @@ def create_error_response( message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: - return ErrorResponse(message=message, - type=err_type, - code=status_code.value) + return ErrorResponse(error=ErrorInfo( + message=message, type=err_type, code=status_code.value)) From 82216dc21f777584bcf53ab1fe4936390c1737bf Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Wed, 6 Aug 2025 23:06:20 -0700 Subject: [PATCH 055/932] [Misc] Support routing logic simulation (#21990) Signed-off-by: Ming Yang Co-authored-by: Tyler Michael Smith Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/test_routing_simulator.py | 171 +++++++++++ vllm/envs.py | 9 + vllm/model_executor/layers/fused_moe/layer.py | 12 + .../layers/fused_moe/routing_simulator.py | 289 ++++++++++++++++++ 4 files changed, 481 insertions(+) create mode 100644 tests/test_routing_simulator.py create mode 100644 vllm/model_executor/layers/fused_moe/routing_simulator.py diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py new file mode 100644 index 0000000000..8324b225a8 --- /dev/null +++ 
b/tests/test_routing_simulator.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test script for the token-to-expert routing simulator. + +This script demonstrates how to use the routing simulator to test +different routing strategies and analyze their performance, including +integration tests with FusedMoE layer. +""" + +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.routing_simulator import ( + DistributionBasedRouting, RoutingSimulator) + + +@pytest.fixture +def device(): + """Fixture to provide the appropriate device for testing.""" + return torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@pytest.mark.parametrize("num_tokens", [1, 16, 256]) +@pytest.mark.parametrize("hidden_size", [64, 1024]) +@pytest.mark.parametrize("num_experts", [16, 128]) +@pytest.mark.parametrize("top_k", [1, 4]) +def test_basic_functionality( + num_tokens: int, + hidden_size: int, + num_experts: int, + top_k: int, + device, +): + """Test basic functionality of the routing simulator.""" + # Test each routing strategy + strategies = RoutingSimulator.get_available_strategies() + + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + for strategy in strategies: + # Simulate routing + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name=strategy, + top_k=top_k, + ) + + # Check output shapes + assert topk_weights.shape == ( + num_tokens, + top_k, + ), f"Wrong weights shape for {strategy}" + assert topk_ids.shape == ( + num_tokens, + top_k, + ), f"Wrong ids shape for {strategy}" + + # Check that expert IDs are valid + assert (topk_ids.min() + >= 0), f"Invalid expert ID (negative) for {strategy}" + assert (topk_ids.max() + < num_experts), f"Invalid expert ID (too large) for {strategy}" + + +def test_routing_strategy_integration(monkeypatch, device): + """Test that the routing strategy environment variable works with + FusedMoE.""" + pytest.importorskip("vllm.model_executor.layers.fused_moe.layer") + + import vllm.envs as envs + from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + # Test parameters + num_tokens = 32 + hidden_size = 16 + num_experts = 4 + top_k = 2 + + # Create test data + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + # Test different routing strategies + strategies = RoutingSimulator.get_available_strategies() + + for strategy in strategies: + # Set environment variable + env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY" + monkeypatch.setenv(env_name, strategy) + + # Force reload of environment variable + envs.environment_variables[env_name] = lambda s=strategy: s + + # Test the select_experts method + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=False, + renormalize=True, + indices_type=torch.long) + + # Verify output shapes + assert topk_weights.shape == ( + num_tokens, top_k), f"Wrong weights shape for {strategy}" + assert topk_ids.shape == (num_tokens, + top_k), f"Wrong ids shape for {strategy}" + + # Verify expert IDs are valid + assert topk_ids.min( + ) >= 0, f"Invalid expert ID (negative) for {strategy}" + assert topk_ids.max( + ) < num_experts, f"Invalid expert ID 
(too large) for {strategy}" + + +def test_distribution_based_routing_with_custom_strategy(): + """Test registering and using DistributionBasedRouting with custom + parameters.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Register custom distribution-based strategy + custom_strategy = DistributionBasedRouting(distribution="normal", + mean=2.0, + std=0.5) + RoutingSimulator.register_strategy("custom_normal", custom_strategy) + + # Test data + num_tokens = 60 + hidden_size = 48 + num_experts = 6 + top_k = 3 + + hidden_states = torch.randn(num_tokens, hidden_size, device=device) + router_logits = torch.randn(num_tokens, num_experts, device=device) + + # Use the custom strategy + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name="custom_normal", + top_k=top_k) + + # Check output shapes + assert topk_weights.shape == (num_tokens, top_k) + assert topk_ids.shape == (num_tokens, top_k) + + # Check that expert IDs are valid + assert topk_ids.min() >= 0 + assert topk_ids.max() < num_experts + + +def test_instance_compatibility(): + """Test that static methods work correctly.""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Test static method directly + hidden_states = torch.randn(10, 8, device=device) + router_logits = torch.randn(10, 4, device=device) + + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name="uniform_random", + top_k=2) + + assert topk_weights.shape == (10, 2) + assert topk_ids.shape == (10, 2) diff --git a/vllm/envs.py b/vllm/envs.py index d9ebf59c1a..f6c6d7e7ed 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -989,6 +989,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + # MoE routing strategy selector. + # See `RoutingSimulator.get_available_strategies()` # for available + # strategies. + # Cutstom routing strategies can be registered by + # RoutingSimulator.register_strategy() + # Note: custom strategies may not produce correct model outputs + "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": + lambda: os.environ.get("VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "").lower(), + # Regex timeout for use by the vLLM tool parsing plugins. 
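A usage sketch for the routing-simulation switch added above; the strategy names come from the registry in routing_simulator.py below, and the served model name is only a placeholder.

# Select a simulated routing strategy for a running server, e.g.
#   VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random vllm serve <model>
# or drive the simulator directly, as the new tests do:
import torch

from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator

hidden_states = torch.randn(8, 16)   # (num_tokens, hidden_size)
router_logits = torch.randn(8, 4)    # (num_tokens, num_experts)
topk_weights, topk_ids = RoutingSimulator.simulate_routing(
    hidden_states=hidden_states,
    router_logits=router_logits,
    strategy_name="uniform_random",  # or "normal_routing"
    top_k=2,
)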
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 72c2bc9a3d..76cedb3ed3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -28,6 +28,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( is_rocm_aiter_moe_enabled) +from vllm.model_executor.layers.fused_moe.routing_simulator import ( + RoutingSimulator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -1362,6 +1364,16 @@ class FusedMoE(torch.nn.Module): """ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk + # Check if we should use a routing simulation strategy + routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY + if routing_strategy != "": + return RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name=routing_strategy, + top_k=top_k, + indices_type=indices_type) + # DeepSeekv2 uses grouped_top_k if use_grouped_topk: assert topk_group is not None diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py new file mode 100644 index 0000000000..c8b107f13c --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py @@ -0,0 +1,289 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Token-to-Expert Routing Simulator + +This module provides a framework for simulating and testing different +token-to-expert routing strategies for Mixture of Experts (MoE) models. +It supports routing logic customization and includes example implementations +like uniform random routing. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +import torch + + +class RoutingStrategy(ABC): + """Base class for token-to-expert routing strategies.""" + + @abstractmethod + def route_tokens( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Route tokens to experts. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + pass + + +class DistributionBasedRouting(RoutingStrategy): + """ + Distribution-based random routing strategy with configurable distributions. + + This routing strategy randomly selects experts for each token based on + different probability distributions. Currently supports uniform and normal + distributions for testing different routing patterns. + """ + + def __init__(self, distribution: str = "uniform", **distribution_params): + """ + Initialize distribution-based routing. 
+ + Args: + distribution: Type of distribution to use for sampling + - "uniform": Uniform distribution (default) + - "normal": Normal/Gaussian distribution + **distribution_params: Parameters specific to the + chosen distribution + For "uniform": No additional parameters needed + For "normal": mean (default: 0.0), std (default: 1.0) + """ + self.distribution = distribution.lower() + self.distribution_params = distribution_params + + # Validate distribution and parameters + self._validate_distribution_params() + + def _validate_distribution_params(self): + """Validate distribution type and parameters.""" + valid_distributions = ["uniform", "normal"] + + if self.distribution not in valid_distributions: + raise ValueError(f"Unsupported distribution: {self.distribution}. " + f"Supported distributions: {valid_distributions}") + + # Set default parameters if not provided + if self.distribution == "normal": + self.distribution_params.setdefault("mean", 0.0) + self.distribution_params.setdefault("std", 1.0) + + def route_tokens( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Randomly select experts for each token using the specified distribution. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) where: + - topk_weights: Weights based on distribution sampling + - topk_ids: Expert indices sampled from the distribution + """ + num_tokens = hidden_states.shape[0] + num_experts = router_logits.shape[-1] + + if indices_type is None: + indices_type = torch.long + + # Generate expert IDs based on the specified distribution + topk_ids = self._sample_expert_ids(num_tokens, num_experts, top_k, + hidden_states.device, indices_type) + + # Generate weights based on the distribution + topk_weights = self._generate_weights(num_tokens, top_k, + hidden_states.device) + + return topk_weights, topk_ids + + def _sample_expert_ids( + self, + num_tokens: int, + num_experts: int, + top_k: int, + device: torch.device, + indices_type: torch.dtype, + ) -> torch.Tensor: + """Sample expert IDs based on the specified distribution.""" + + if self.distribution == "uniform": + # Uniform random sampling + return torch.randint( + low=0, + high=num_experts, + size=(num_tokens, top_k), + dtype=indices_type, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, sample continuous values and map to + # expert IDs + continuous_samples = self._sample_continuous_distribution( + num_tokens, top_k, device) + + # Map continuous samples to expert indices + # Normalize to [0, 1] range and scale to [0, num_experts) + normalized_samples = self._normalize_samples(continuous_samples) + expert_ids = (normalized_samples * num_experts).long() + expert_ids = torch.clamp(expert_ids, 0, num_experts - 1) + + return expert_ids.to(dtype=indices_type) + + else: + raise ValueError(f"Unsupported distribution: {self.distribution}") + + def _sample_continuous_distribution(self, num_tokens: int, top_k: int, + device: torch.device) -> torch.Tensor: + """Sample from continuous distributions.""" + shape = (num_tokens, top_k) + + if self.distribution == "normal": + mean = self.distribution_params["mean"] + std = self.distribution_params["std"] + return torch.normal(mean, std, 
size=shape, device=device) + + else: + raise ValueError( + f"Unsupported continuous distribution: {self.distribution}") + + def _normalize_samples(self, samples: torch.Tensor) -> torch.Tensor: + """Normalize samples to [0, 1] range.""" + if self.distribution == "normal": + # Use sigmoid to map normal distribution to [0, 1] + return torch.sigmoid(samples) + + else: + raise ValueError(f"Unsupported distribution for normalization: " + f"{self.distribution}") + + def _generate_weights(self, num_tokens: int, top_k: int, + device: torch.device) -> torch.Tensor: + """Generate weights based on the distribution.""" + if self.distribution == "uniform": + # All-ones weights for uniform distribution + return torch.ones( + (num_tokens, top_k), + dtype=torch.float32, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, generate weights from the same + # distribution + continuous_weights = self._sample_continuous_distribution( + num_tokens, top_k, device) + # Normalize to positive values and sum to 1 + weights = torch.abs(continuous_weights) + weights = weights / weights.sum(dim=-1, keepdim=True) + return weights + + else: + raise ValueError( + f"Unsupported distribution for weight generation: " + f"{self.distribution}") + + def get_distribution_info(self) -> dict: + """Get information about the current distribution configuration.""" + return { + "distribution": self.distribution, + "parameters": self.distribution_params.copy() + } + + +class RoutingSimulator: + """ + Token-to-Expert Routing Simulator. + + This class provides a framework for testing and comparing different + routing strategies for MoE models. It can simulate routing behavior + and collect statistics for analysis. + """ + + # Class-level registry of routing strategies + _routing_strategies: dict[str, RoutingStrategy] = { + # Basic routing strategies + "uniform_random": + DistributionBasedRouting(distribution="uniform", mean=0.0, std=1.0), + "normal_routing": + DistributionBasedRouting(distribution="normal", mean=0.0, std=1.0), + } + + @classmethod + def register_strategy(cls, name: str, strategy: RoutingStrategy): + """ + Register a custom routing strategy. + + Args: + name: Name of the strategy + strategy: RoutingStrategy instance + """ + cls._routing_strategies[name] = strategy + + @classmethod + def get_available_strategies(cls): + """ + Get list of available routing strategy names. + + Returns: + List of available strategy names + """ + return list(cls._routing_strategies.keys()) + + @staticmethod + def simulate_routing( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + strategy_name: str, + top_k: int, + indices_type: Optional[torch.dtype] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Simulate token-to-expert routing using the specified strategy. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + strategy_name: Name of the routing strategy to use + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + if strategy_name not in RoutingSimulator._routing_strategies: + raise ValueError( + f"Unknown routing strategy: {strategy_name}. 
" + f"Available strategies: " + f"{list(RoutingSimulator._routing_strategies.keys())}") + + strategy = RoutingSimulator._routing_strategies[strategy_name] + return strategy.route_tokens( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + indices_type=indices_type, + ) From 8e8e0b6af189d262bcfdaef6c0cfb94772e86b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Garc=C3=ADa=20Garc=C3=ADa?= Date: Thu, 7 Aug 2025 10:10:13 +0400 Subject: [PATCH 056/932] feat: Add --enable-log-outputs flag for logging model generations (#20707) Signed-off-by: Adrian Garcia --- tests/test_logger.py | 252 ++++++++++++++++++- vllm/entrypoints/logger.py | 36 ++- vllm/entrypoints/openai/cli_args.py | 9 +- vllm/entrypoints/openai/serving_chat.py | 121 +++++++-- vllm/entrypoints/openai/serving_responses.py | 20 ++ 5 files changed, 412 insertions(+), 26 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 8f235f1474..0bfb449cdf 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -10,11 +10,12 @@ from dataclasses import dataclass from json.decoder import JSONDecodeError from tempfile import NamedTemporaryFile from typing import Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch from uuid import uuid4 import pytest +from vllm.entrypoints.logger import RequestLogger from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) from vllm.logging_utils import NewLineFormatter @@ -228,9 +229,10 @@ def test_prepare_object_to_dump(): list_obj = [1, 2, 3] assert prepare_object_to_dump(list_obj) == '[1, 2, 3]' - dict_obj = {'a': 1, 'b': 'b'} + dict_obj = {"a": 1, "b": "b"} assert prepare_object_to_dump(dict_obj) in [ - "{a: 1, b: 'b'}", "{b: 'b', a: 1}" + "{a: 1, b: 'b'}", + "{b: 'b', a: 1}", ] set_obj = {1, 2, 3} @@ -252,4 +254,246 @@ def test_prepare_object_to_dump(): b: str assert (prepare_object_to_dump(CustomClass( - 1, 'b')) == "CustomClass(a=1, b='b')") + 1, "b")) == "CustomClass(a=1, b='b')") + + +def test_request_logger_log_outputs(): + """Test the new log_outputs functionality.""" + # Create a mock logger to capture log calls + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test basic output logging + request_logger.log_outputs( + request_id="test-123", + outputs="Hello, world!", + output_token_ids=[1, 2, 3, 4], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-123" + assert call_args[3] == "Hello, world!" 
+ assert call_args[4] == [1, 2, 3, 4] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_streaming_delta(): + """Test log_outputs with streaming delta mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming delta logging + request_logger.log_outputs( + request_id="test-456", + outputs="Hello", + output_token_ids=[1], + finish_reason=None, + is_streaming=True, + delta=True, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-456" + assert call_args[2] == " (streaming delta)" + assert call_args[3] == "Hello" + assert call_args[4] == [1] + assert call_args[5] is None + + +def test_request_logger_log_outputs_streaming_complete(): + """Test log_outputs with streaming complete mode.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test streaming complete logging + request_logger.log_outputs( + request_id="test-789", + outputs="Complete response", + output_token_ids=[1, 2, 3], + finish_reason="length", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-789" + assert call_args[2] == " (streaming complete)" + assert call_args[3] == "Complete response" + assert call_args[4] == [1, 2, 3] + assert call_args[5] == "length" + + +def test_request_logger_log_outputs_with_truncation(): + """Test log_outputs respects max_log_len setting.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + # Set max_log_len to 10 + request_logger = RequestLogger(max_log_len=10) + + # Test output truncation + long_output = "This is a very long output that should be truncated" + long_token_ids = list(range(20)) # 20 tokens + + request_logger.log_outputs( + request_id="test-truncate", + outputs=long_output, + output_token_ids=long_token_ids, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args + + # Check that output was truncated to first 10 characters + logged_output = call_args[0][3] + assert logged_output == "This is a " + assert len(logged_output) == 10 + + # Check that token IDs were truncated to first 10 tokens + logged_token_ids = call_args[0][4] + assert logged_token_ids == list(range(10)) + assert len(logged_token_ids) == 10 + + +def test_request_logger_log_outputs_none_values(): + """Test log_outputs handles None values correctly.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with None output_token_ids + request_logger.log_outputs( + request_id="test-none", + outputs="Test output", + output_token_ids=None, + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-none" + assert call_args[3] == "Test output" + assert call_args[4] is None + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_empty_output(): + """Test log_outputs handles empty output correctly.""" + 
mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=5) + + # Test with empty output + request_logger.log_outputs( + request_id="test-empty", + outputs="", + output_token_ids=[], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + assert "Generated response %s%s" in call_args[0] + assert call_args[1] == "test-empty" + assert call_args[3] == "" + assert call_args[4] == [] + assert call_args[5] == "stop" + + +def test_request_logger_log_outputs_integration(): + """Test that log_outputs can be called alongside log_inputs.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test that both methods can be called without interference + request_logger.log_inputs( + request_id="test-integration", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_embeds=None, + params=None, + lora_request=None, + ) + + request_logger.log_outputs( + request_id="test-integration", + outputs="Test output", + output_token_ids=[4, 5, 6], + finish_reason="stop", + is_streaming=False, + delta=False, + ) + + # Should have been called twice - once for inputs, once for outputs + assert mock_logger.info.call_count == 2 + + # Check that the calls were made with correct patterns + input_call = mock_logger.info.call_args_list[0][0] + output_call = mock_logger.info.call_args_list[1][0] + + assert "Received request %s" in input_call[0] + assert input_call[1] == "test-integration" + + assert "Generated response %s%s" in output_call[0] + assert output_call[1] == "test-integration" + + +def test_streaming_complete_logs_full_text_content(): + """Test that streaming complete logging includes + full accumulated text, not just token count.""" + mock_logger = MagicMock() + + with patch("vllm.entrypoints.logger.logger", mock_logger): + request_logger = RequestLogger(max_log_len=None) + + # Test with actual content instead of token count format + full_response = "This is a complete response from streaming" + request_logger.log_outputs( + request_id="test-streaming-full-text", + outputs=full_response, + output_token_ids=None, + finish_reason="streaming_complete", + is_streaming=True, + delta=False, + ) + + mock_logger.info.assert_called_once() + call_args = mock_logger.info.call_args.args + + # Verify the logged output is the full text, not a token count format + logged_output = call_args[3] + assert logged_output == full_response + assert "tokens>" not in logged_output + assert "streaming_complete" not in logged_output + + # Verify other parameters + assert call_args[1] == "test-streaming-full-text" + assert call_args[2] == " (streaming complete)" + assert call_args[5] == "streaming_complete" diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index 06ff3b417f..152d11c84e 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence from typing import Optional, Union import torch @@ -16,8 +17,6 @@ logger = init_logger(__name__) class RequestLogger: def __init__(self, *, max_log_len: Optional[int]) -> None: - super().__init__() - self.max_log_len = max_log_len def log_inputs( @@ -45,3 +44,36 @@ class RequestLogger: "lora_request: %s.", request_id, prompt, params, 
prompt_token_ids, prompt_embeds.shape if prompt_embeds is not None else None, lora_request) + + def log_outputs( + self, + request_id: str, + outputs: str, + output_token_ids: Optional[Sequence[int]], + finish_reason: Optional[str] = None, + is_streaming: bool = False, + delta: bool = False, + ) -> None: + max_log_len = self.max_log_len + if max_log_len is not None: + if outputs is not None: + outputs = outputs[:max_log_len] + + if output_token_ids is not None: + # Convert to list and apply truncation + output_token_ids = list(output_token_ids)[:max_log_len] + + stream_info = "" + if is_streaming: + stream_info = (" (streaming delta)" + if delta else " (streaming complete)") + + logger.info( + "Generated response %s%s: output: %r, " + "output_token_ids: %s, finish_reason: %s", + request_id, + stream_info, + outputs, + output_token_ids, + finish_reason, + ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 12318b300c..e89463a03c 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -44,10 +44,10 @@ class LoRAParserAction(argparse.Action): lora_list: list[LoRAModulePath] = [] for item in values: - if item in [None, '']: # Skip if item is None or empty string + if item in [None, ""]: # Skip if item is None or empty string continue - if '=' in item and ',' not in item: # Old format: name=path - name, path = item.split('=') + if "=" in item and "," not in item: # Old format: name=path + name, path = item.split("=") lora_list.append(LoRAModulePath(name, path)) else: # Assume JSON format try: @@ -167,6 +167,9 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" enable_tokenizer_info_endpoint: bool = False """Enable the /get_tokenizer_info endpoint. May expose chat templates and other tokenizer configuration.""" + enable_log_outputs: bool = False + """If set to True, enable logging of model outputs (generations) + in addition to the input logging that is enabled by default.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6ad0a8ec54..b4231c6d10 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -73,6 +73,7 @@ class OpenAIServingChat(OpenAIServing): tool_parser: Optional[str] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, @@ -84,6 +85,7 @@ class OpenAIServingChat(OpenAIServing): self.response_role = response_role self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format + self.enable_log_outputs = enable_log_outputs # set up tool use self.enable_auto_tools: bool = enable_auto_tools @@ -489,20 +491,21 @@ class OpenAIServingChat(OpenAIServing): all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * num_choices + # Always track previous_texts for comprehensive output logging + previous_texts = [""] * num_choices + # Only one of these will be used, thus previous_texts and # all_previous_token_ids will not be used twice in the same iteration. 
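The previous_texts accumulated here feed the two log_outputs call shapes used in this file; a self-contained sketch of both, mirroring the RequestLogger API added in logger.py (request id, text, and token ids are illustrative):

from vllm.entrypoints.logger import RequestLogger

request_logger = RequestLogger(max_log_len=None)

# Per-chunk delta while streaming ...
request_logger.log_outputs(
    request_id="chatcmpl-example",
    outputs="Hello",                  # one delta_text chunk
    output_token_ids=[9906],          # illustrative token ids for the chunk
    finish_reason=None,
    is_streaming=True,
    delta=True,
)

# ... and the full accumulated text once the stream completes.
request_logger.log_outputs(
    request_id="chatcmpl-example",
    outputs="Hello, world!",          # previous_texts[i] at the end
    output_token_ids=None,
    finish_reason="streaming_complete",
    is_streaming=True,
    delta=False,
)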
         if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
-            previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
         elif request.tool_choice == "required":
-            previous_texts = [""] * num_choices
             all_previous_token_ids = None
         else:
-            previous_texts, all_previous_token_ids = None, None
+            all_previous_token_ids = None

         try:
             if self.reasoning_parser:
@@ -844,6 +847,7 @@ class OpenAIServingChat(OpenAIServing):
                                 current_token_ids=current_token_ids,
                                 delta_token_ids=output.token_ids,
                                 request=request))
+
                     # when only reasoning
                     elif self.reasoning_parser:
                         delta_message = (reasoning_parser.
@@ -865,6 +869,10 @@ class OpenAIServingChat(OpenAIServing):
                         assert all_previous_token_ids is not None
                         previous_texts[i] = current_text
                         all_previous_token_ids[i] = current_token_ids
+                    else:
+                        # Update for comprehensive logging even in simple case
+                        assert previous_texts is not None
+                        previous_texts[i] += delta_text

                     # set the previous values for the next iteration
                     previous_num_tokens[i] += len(output.token_ids)
@@ -876,6 +884,27 @@ class OpenAIServingChat(OpenAIServing):
                 if delta_message is None:
                     continue

+                # Log streaming delta if output logging is enabled
+                if self.enable_log_outputs and self.request_logger:
+                    delta_content = ""
+                    if delta_message.content:
+                        delta_content = delta_message.content
+                    elif delta_message.tool_calls:
+                        delta_content = "".join(
+                            tc.function.arguments
+                            for tc in delta_message.tool_calls
+                            if tc.function and tc.function.arguments)
+
+                    if delta_content:
+                        self.request_logger.log_outputs(
+                            request_id=request_id,
+                            outputs=delta_content,
+                            output_token_ids=list(output.token_ids),
+                            finish_reason=output.finish_reason,
+                            is_streaming=True,
+                            delta=True,
+                        )
+
                 if output.finish_reason is None:
                     # Send token-by-token response for each request.n
                     choice_data = ChatCompletionResponseStreamChoice(
@@ -994,7 +1023,27 @@ class OpenAIServingChat(OpenAIServing):
             request_metadata.final_usage_info = UsageInfo(
                 prompt_tokens=num_prompt_tokens,
                 completion_tokens=num_completion_tokens,
-                total_tokens=num_prompt_tokens + num_completion_tokens)
+                total_tokens=num_prompt_tokens + num_completion_tokens,
+            )
+
+            # Log complete streaming response if output logging is enabled
+            if self.enable_log_outputs and self.request_logger:
+                # Log the complete response for each choice
+                for i in range(num_choices):
+                    full_text = (
+                        previous_texts[i]
+                        if previous_texts and i < len(previous_texts) else
+                        f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                    )
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=full_text,
+                        output_token_ids=
+                        None,  # Consider also logging all token IDs
+                        finish_reason="streaming_complete",
+                        is_streaming=True,
+                        delta=False,
+                    )

         except Exception as e:
             # TODO: Use a vllm-specific Validation Error
@@ -1121,8 +1170,10 @@ class OpenAIServingChat(OpenAIServing):
                     tool_calls=[
                         tool_call_class(function=FunctionCall(
                             name=request.tool_choice.function.name,
-                            arguments=content))
-                    ])
+                            arguments=content,
+                        ))
+                    ],
+                )

         elif request.tool_choice and request.tool_choice == "required":
             tool_call_class = MistralToolCall if isinstance(
@@ -1209,12 +1260,13 @@ class OpenAIServingChat(OpenAIServing):
                 finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
+
             choices.append(choice_data)

         if request.echo:
             last_msg_content: Union[str, list[dict[str, str]]] = ""
-
if conversation and "content" in conversation[-1] and conversation[ - -1].get("role") == role: + if (conversation and "content" in conversation[-1] + and conversation[-1].get("role") == role): last_msg_content = conversation[-1]["content"] or "" if isinstance(last_msg_content, list): last_msg_content = "\n".join(msg['text'] @@ -1251,6 +1303,40 @@ class OpenAIServingChat(OpenAIServing): kv_transfer_params=final_res.kv_transfer_params, ) + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + output_text = "" + if choice.message.content: + output_text = choice.message.content + elif choice.message.tool_calls: + # For tool calls, log the function name and arguments + tool_call_descriptions = [] + for tool_call in choice.message.tool_calls: + if hasattr(tool_call.function, "name") and hasattr( + tool_call.function, "arguments"): + tool_call_descriptions.append( + f"{tool_call.function.name}({tool_call.function.arguments})" + ) + tool_calls_str = ", ".join(tool_call_descriptions) + output_text = f"[tool_calls: {tool_calls_str}]" + + if output_text: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[ + choice.index].token_ids + + self.request_logger.log_outputs( + request_id=request_id, + outputs=output_text, + output_token_ids=output_token_ids, + finish_reason=choice.finish_reason, + is_streaming=False, + delta=False, + ) + return response def _get_top_logprobs( @@ -1258,15 +1344,16 @@ class OpenAIServingChat(OpenAIServing): tokenizer: AnyTokenizer, should_return_as_token_id: bool) -> list[ChatCompletionLogProb]: return [ - ChatCompletionLogProb(token=(token := self._get_decoded_token( - p[1], - p[0], - tokenizer, - return_as_token_id=should_return_as_token_id)), - logprob=max(p[1].logprob, -9999.0), - bytes=list( - token.encode("utf-8", errors="replace"))) - for i, p in enumerate(logprobs.items()) + ChatCompletionLogProb( + token=(token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=should_return_as_token_id, + )), + logprob=max(p[1].logprob, -9999.0), + bytes=list(token.encode("utf-8", errors="replace")), + ) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 3c0b590b0c..f26f92537c 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -65,6 +65,7 @@ class OpenAIServingResponses(OpenAIServing): tool_server: Optional[ToolServer] = None, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, ) -> None: super().__init__( engine_client=engine_client, @@ -77,6 +78,7 @@ class OpenAIServingResponses(OpenAIServing): self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format + self.enable_log_outputs = enable_log_outputs self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = None @@ -428,6 +430,24 @@ class OpenAIServingResponses(OpenAIServing): usage=usage, ) + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + output_text = "" + if content: + output_text = content + elif reasoning_content: + output_text = f"[reasoning: {reasoning_content}]" + + if output_text: + self.request_logger.log_outputs( + 
request_id=request.request_id, + outputs=output_text, + output_token_ids=final_output.token_ids, + finish_reason=final_output.finish_reason, + is_streaming=False, + delta=False, + ) + if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) From 434d2f3f7ab3b6768df59f8d9d81e43bf38204f7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:22:07 +0100 Subject: [PATCH 057/932] [Docs] Add missing dependency for docs build (#22435) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/docs.txt b/requirements/docs.txt index 4d4fc7da68..c589093110 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -19,6 +19,7 @@ cloudpickle fastapi msgspec openai +openai-harmony partial-json-parser pillow psutil From c2dba2dba8e4ebff1b7772ffbe811b0165e844d7 Mon Sep 17 00:00:00 2001 From: JaceyShao <65159281+JaceyShao@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:24:47 +0800 Subject: [PATCH 058/932] Add H20-3e fused MoE kernel tuning configs for GLM-4.5 (#22433) Signed-off-by: shaojunqi Co-authored-by: shaojunqi --- ...E=160,N=192,device_name=NVIDIA_H20-3e.json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000000..f2ed716c8b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} From 136825de756f5421283e404e3991b77a9d33c131 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Aug 2025 00:26:24 -0700 Subject: [PATCH 059/932] [Misc] Enhance code formatting in mxfp4.py (#22423) Signed-off-by: Woosuk Kwon --- .../layers/quantization/mxfp4.py | 85 ++++++++++++------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b6d7bc5d5c..068af02739 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -109,55 +109,74 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.intermediate_size = intermediate_size_per_partition_after_pad self.hidden_size = hidden_size # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - hidden_size // 2, - dtype=weight_dtype), - requires_grad=False) + w13_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // 2, + dtype=weight_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w13_weight_scale = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - hidden_size // mxfp4_block, - dtype=scale_dtype), - requires_grad=False) + w13_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w13_weight_scale", w13_weight_scale) set_weight_attrs(w13_weight_scale, extra_weight_attrs) - w13_bias = torch.nn.Parameter(torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - dtype=torch.bfloat16), - requires_grad=False) + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) layer.register_parameter("w13_bias", w13_bias) set_weight_attrs(w13_bias, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.zeros( - num_experts, - hidden_size, - intermediate_size_per_partition_after_pad // 2, - dtype=weight_dtype), - requires_grad=False) + w2_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // 2, + dtype=weight_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - w2_weight_scale = torch.nn.Parameter(torch.zeros( - num_experts, - hidden_size, - 
intermediate_size_per_partition_after_pad // mxfp4_block, - dtype=scale_dtype), - requires_grad=False) + w2_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) layer.register_parameter("w2_weight_scale", w2_weight_scale) set_weight_attrs(w2_weight_scale, extra_weight_attrs) - w2_bias = torch.nn.Parameter(torch.zeros(num_experts, - hidden_size, - dtype=torch.bfloat16), - requires_grad=False) + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) layer.register_parameter("w2_bias", w2_bias) set_weight_attrs(w2_bias, extra_weight_attrs) From 5e8398805ed6b6e59e3408fe64ed37d189b77149 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Thu, 7 Aug 2025 00:28:15 -0700 Subject: [PATCH 060/932] [Doc] Fix link to prefix caching design (#22384) Signed-off-by: Yong Hoon Shin --- docs/features/automatic_prefix_caching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md index f3c4bdd85c..c529da684e 100644 --- a/docs/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -5,7 +5,7 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. !!! note - Technical details on how vLLM implements APC can be found [here](../design/automatic_prefix_caching.md). + Technical details on how vLLM implements APC can be found [here](../design/prefix_caching.md). ## Enabling APC in vLLM From a2c6696bfee0ec2275dc08b15eee154ed0e9c0c7 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Thu, 7 Aug 2025 00:29:13 -0700 Subject: [PATCH 061/932] [Docs] Factor out troubleshooting to its own guide; add section for Ray Observability (#21578) Signed-off-by: Ricardo Decal --- docs/serving/distributed_serving.md | 29 +++++++++++---------- docs/serving/distributed_troubleshooting.md | 16 ++++++++++++ 2 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 docs/serving/distributed_troubleshooting.md diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 08d889a00d..fc9d9f8a34 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -128,12 +128,17 @@ vllm serve /path/to/the/model/in/the/container \ --tensor-parallel-size 16 ``` -## Troubleshooting distributed deployments +## Optimizing network communication for tensor parallelism -To make tensor parallelism performant, ensure that communication between nodes is efficient, for example, by using high-speed network cards such as InfiniBand. To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Contact your system administrator for more information about the required flags. One way to confirm if InfiniBand is working is to run `vllm` with the `NCCL_DEBUG=TRACE` environment variable set, for example `NCCL_DEBUG=TRACE vllm serve ...`, and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, NCCL uses a raw TCP socket, which is not efficient for cross-node tensor parallelism. 
If you find `[send] via NET/IB/GDRDMA` in the logs, NCCL uses InfiniBand with GPUDirect RDMA, which is efficient.
+Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand.
+To set up the cluster to use InfiniBand, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the
+`run_cluster.sh` helper script.
+Contact your system administrator for more information about the required flags.

 ## Enabling GPUDirect RDMA

+GPUDirect RDMA (Remote Direct Memory Access) is an NVIDIA technology that allows network adapters to directly access GPU memory, bypassing the CPU and system memory. This direct access reduces latency and CPU overhead, which is beneficial for large data transfers between GPUs across nodes.
+
 To enable GPUDirect RDMA with vLLM, configure the following settings:

 - `IPC_LOCK` security context: add the `IPC_LOCK` capability to the container's security context to lock memory pages and prevent swapping to disk.
@@ -175,21 +180,17 @@ spec:
   ...
 ```

-Efficient tensor parallelism requires fast inter-node communication, preferably through high-speed network adapters such as InfiniBand. To enable InfiniBand, append flags such as `--privileged -e NCCL_IB_HCA=mlx5` to `run_cluster.sh`. For cluster-specific settings, consult your system administrator.
+!!! tip "Confirm GPUDirect RDMA operation"
+    To confirm your InfiniBand card is using GPUDirect RDMA, run vLLM with detailed NCCL logs: `NCCL_DEBUG=TRACE vllm serve ...`.
-To confirm InfiniBand operation, enable detailed NCCL logs:
+    Then look for the NCCL version and the network used.
-```bash
-NCCL_DEBUG=TRACE vllm serve ...
-```
-
-Search the logs for the transport method. Entries containing `[send] via NET/Socket` indicate raw TCP sockets, which perform poorly for cross-node tensor parallelism. Entries containing `[send] via NET/IB/GDRDMA` indicate InfiniBand with GPUDirect RDMA, which provides high performance.
-
-!!! tip "Verify inter-node GPU communication"
-    After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to `run_cluster.sh`, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see .
+    - If you find `[send] via NET/IB/GDRDMA` in the logs, then NCCL is using InfiniBand with GPUDirect RDMA, which *is* efficient.
+    - If you find `[send] via NET/Socket` in the logs, NCCL used a raw TCP socket, which *is not* efficient for cross-node tensor parallelism.

 !!! tip "Pre-download Hugging Face models"
     If you use Hugging Face models, downloading the model before starting vLLM is recommended. Download the model on every node to the same path, or store the model on a distributed file system accessible by all nodes. Then pass the path to the model in place of the repository ID. Otherwise, supply a Hugging Face token by appending `-e HF_TOKEN=` to `run_cluster.sh`.

-!!! tip
-    The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one.
    Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in `run_cluster.sh` (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see .
+## Troubleshooting distributed deployments
+
+For information about distributed debugging, see [Troubleshooting distributed deployments](distributed_troubleshooting.md).
diff --git a/docs/serving/distributed_troubleshooting.md b/docs/serving/distributed_troubleshooting.md
new file mode 100644
index 0000000000..bd45f010ed
--- /dev/null
+++ b/docs/serving/distributed_troubleshooting.md
@@ -0,0 +1,16 @@
+# Troubleshooting distributed deployments
+
+For general troubleshooting, see [Troubleshooting](../usage/troubleshooting.md).
+
+## Verify inter-node GPU communication
+
+After you start the Ray cluster, verify GPU-to-GPU communication across nodes. Proper configuration can be non-trivial. For more information, see [troubleshooting script][troubleshooting-incorrect-hardware-driver]. If you need additional environment variables for communication configuration, append them to `run_cluster.sh`, for example `-e NCCL_SOCKET_IFNAME=eth0`. Setting environment variables during cluster creation is recommended because the variables propagate to all nodes. In contrast, setting environment variables in the shell affects only the local node. For more information, see .
+
+## No available node types can fulfill resource request
+
+The error message `Error: No available node types can fulfill resource request` can appear even when the cluster has enough GPUs. The issue often occurs when nodes have multiple IP addresses and vLLM can't select the correct one. Ensure that vLLM and Ray use the same IP address by setting `VLLM_HOST_IP` in `run_cluster.sh` (with a different value on each node). Use `ray status` and `ray list nodes` to verify the chosen IP address. For more information, see .
+
+## Ray observability
+
+Debugging a distributed system can be challenging due to the large scale and complexity. Ray provides a suite of tools to help monitor, debug, and optimize Ray applications and clusters. For more information about Ray observability, visit the [official Ray observability docs](https://docs.ray.io/en/latest/ray-observability/index.html). For more information about debugging Ray applications, visit the [Ray Debugging Guide](https://docs.ray.io/en/latest/ray-observability/user-guides/debug-apps/index.html). For information about troubleshooting Kubernetes clusters, see the
+[official KubeRay troubleshooting guide](https://docs.ray.io/en/latest/serve/advanced-guides/multi-node-gpu-troubleshooting.html).
From 35171b1172fe59810612ac35de9ee29ccfbd8b65 Mon Sep 17 00:00:00 2001
From: Andrew Chan
Date: Thu, 7 Aug 2025 00:26:24 -0700
Subject: [PATCH 062/932] [Doc] update docs for nightly benchmarks (#12022)

Signed-off-by: Andrew Chan
---
 .buildkite/nightly-benchmarks/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 3721d3d1d6..3f2e2da397 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -168,9 +168,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
 ### Workflow

 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. -- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. -- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. +- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. +- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. +- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. ### Nightly tests @@ -180,6 +180,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). 
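Editor's note: the GPUDirect RDMA check described in the distributed-serving documentation patch above comes down to scanning the `NCCL_DEBUG=TRACE` output for the transport that NCCL reports. The sketch below is illustrative only and is not part of any patch in this series; the script name, function name, and log-file argument are assumptions, while the two `[send] via ...` markers are quoted from the documentation change itself.

```python
import re
import sys

# Transport markers quoted from the distributed-serving docs above.
GDRDMA_MARKER = "[send] via NET/IB/GDRDMA"  # InfiniBand + GPUDirect RDMA (efficient)
SOCKET_MARKER = "[send] via NET/Socket"     # raw TCP sockets (slow for cross-node TP)


def check_nccl_transport(log_path: str) -> str:
    """Classify the NCCL send transport found in a captured NCCL_DEBUG=TRACE log."""
    with open(log_path, encoding="utf-8", errors="replace") as f:
        text = f.read()
    if GDRDMA_MARKER in text:
        return "GPUDirect RDMA over InfiniBand (efficient)"
    if SOCKET_MARKER in text:
        return "raw TCP sockets (revisit NCCL_IB_HCA / InfiniBand setup)"
    # NCCL also logs its version, e.g. "NCCL INFO NCCL version 2.x.y".
    match = re.search(r"NCCL version [0-9][0-9.]*", text)
    found = match.group(0) if match else "no NCCL version line found"
    return f"no send-transport line found ({found})"


if __name__ == "__main__":
    # Usage (hypothetical file name): python check_nccl_log.py nccl_trace.log
    print(check_nccl_transport(sys.argv[1]))
```

Running it against a log captured with `NCCL_DEBUG=TRACE vllm serve ...` gives a one-line verdict instead of scrolling through the raw trace.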
From 289b18e670c2439dfc1f4f80df782de9ad112762 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:59:23 +0800 Subject: [PATCH 063/932] [Docs] Update features/disagg_prefill, add v1 examples and development (#22165) Signed-off-by: David Chen <530634352@qq.com> --- .../disagg_prefill/high_level_design.png | Bin 0 -> 92801 bytes .../features/disagg_prefill/workflow.png | Bin 0 -> 89969 bytes docs/features/disagg_prefill.md | 25 ++++++++++++++++++ 3 files changed, 25 insertions(+) create mode 100644 docs/assets/features/disagg_prefill/high_level_design.png create mode 100644 docs/assets/features/disagg_prefill/workflow.png diff --git a/docs/assets/features/disagg_prefill/high_level_design.png b/docs/assets/features/disagg_prefill/high_level_design.png new file mode 100644 index 0000000000000000000000000000000000000000..ce9b1c88276602d08b2aa4892e4c3e3aeb4fe67e GIT binary patch literal 92801 zcmdSBbyQp3w>69uDQ?BBNE_T8LUAZitSwL+inVyKpryELaVbt)9E!UI2!sMH?iM^i zaQ)Kf_uTv5Z`?87|KE(0k(}(2vrqP3d#yF+o-2{suT%)}Xz|d{&Ah~_`mYnfOKFw<*8kB6Vw8d}_V!|mIWH-`e_!Mg{coGJZ4#SY)W%Tab1gBa0-j!}jgH(S`2_L%%C#|9;KMCwZz%-7G>=i- z1?_CN9p9PBd2*T;yYZB!-1mu+THAt(ogE;KNh<1B0*U+1#1l;H)5b$zA5>p-VorTh zh_+y7TL1X4JpcUv?&jM<%Vu2M!Q9~O#ld_-_tMf*%X14d3W^Zj;+Ng?^K`OjbP@8Q z(SsD|N;+h+Df9RD_ta7!9ue&?NB2jkeGRzts_@N=3s1Sa!l!!gUj2S(;#&9LNi~(^ zNGg31MnJQ=x6b3FK})63)f|X4mkzKI}L1BJLbvwfYanl+1rlRrlzkZ zkXJ5lo&!y;>*dbjrSPZ60fWj6ieSacd z;syGHJN&OvlA!MAgehO!)zedc_ccc%r}Z#ZM;psn8lM&~#Yi3w*2$)FoP%MxFQeu$ zOXy~@l!_>-t03f6o$60t9NmM3&#zk>+J({cZZAoj98AN7v!=S2md9GoHrF;b0Ap{4 zQ#eLIXWMeL%DfHI!5TN)wh8iC_E&0$`}99%QsHw$C&tL$+CC*g92^QRUG3Zyb^dUs1o`k6-XOu^ zsCJ6gp6wKA#H4VNr>ELR@-NycHZRr@EEN{DqANugwb-lL$2%>}Jp^Tp&#a3-9SlZOsjs=8+`>GDmlwD#~Sje;Xd9Rf>* z9g>2T6%|)DGC<;i$F;Yo#JJUmU{`T*wkK7|`-Xx;e{FAgebwA?hpazlhBy3Vs(W@= z*&At_Dg4dK@uS`5Xc9*uHRrZHHMy^mlm)OPw8lY$xkxpa{tag;BvxMX`h0xk!HXd1rD!UhvYGJ2NW$&6l7kzU$d@OUY_L-50p58(R zc>y`Y&wY}bhRw0$HSKP1c6pUjs(^B|Y|K49Ju%uXpFfH%UpJ)Ev8@de%>52X8_l$rB{~Y>C^lsw*&09Hh&Tt8vDg-OY^HF3AW@o^1$iC zkiS21C`F(>;?&)q$7x!$d(@LzbN4sEs5af(ub;x6L$ByXkRMpP`h{uHs|-P%NO9BX z9MV2rCN)N(OLI>(f5W{)oj+Zvk!Qp8w|!f?FpwSvh%>7wS$Z9I8W&FG)U+HHUMf~; z(i!A?acFqaMs4s6NZf{)t+jde`a(v)e0kdZ&IIMJW#3jBN4A{p_WpT;Ny72c_^36J zw<0>`m9FaY@u|FvON4`$0N)-47}Pr|!qDdLSM182AWMu#H4;CPA-Hn5=!;8$SG)B- zh=tSkyj%x4C$Xm1C>m9G6oEUG#`+Ts+9Vl&LPi#LCCn3?`^ct3M`B*}=Q5((NI5bTV4lzh$$ zT(P2~Y(3%^&jSXfw0be0b(p9ItO1ftI-^>hc1U+N`i7Ns`Lv1_I_X}e@eT@HOf(I8 zLZDQMCxhtX3mu^mF&PQ_Q<_C{t>oXloPyUw>zdM7p0MbZSgLbDNLyBAtkOU-?<|Zuc5s;0q0?pYFjNC;C|u z_T{?i_t#AbI}R@m@s}{)BoUA@o7SPjq3;nUOWIpP(IkS)0pC8~VYJ}33k@$wFZt7a zR~(pC4pM?GSNR3bBjBa6QaGgtFu5U!N%WWLQU`7wP=)*B(t?Cin@ak}&3k4f_t9*e zV|7GMI`>wK!M!^#@76-9ua;{LpVq7Quymb61#j;VX*2fdCjw*MqF~r(d3i?ffGa=b zyBQsBWB$B8J9EPfvuZUcLgl`#xz9g=lDWjz-|ahEu)y{8mAZe8#z^>dqLi&CNW*ET zNiq{Yye!MSE9u?#V@S4><9R9h<>j@S3?-Mb_HY+GBNwafetQs{lan}^62G|t_9oU# ze^q&K?O2@+2Eh{$prwV>S{3D`v^}-`VCSfl@}s+IYozjm$a1gEXy(ujM}E&m&-XV! 
zTrQ6+{4S46w-#3%7LjYKzGd#m1t&oGO*Cc&6|YHm#XMt)aiV1++&B4-vV(P&XjV0Y zr5PMwns(b<+&-~8)>229i@a*`fUM^Ob7EUdWX=Wr)f)O6&aag$kuK{i z7&lDx(!WIauDakhHw_g{_Sau!e9!Bh{3U;%R08Bz4hsf(r zsvZ@aG`%n--4&g=9Dl;)1*2z?o=LCnF5U*bDNud=i(h4un%^v-ptZV2j@O#j7VjC+ zvmLMziBfnt1PAhKOYI*WFI5?izShPY;SBp3~^rwXgq*&aK#J<5XrA<6=se9}>r}O;t zZ&iHoX9jVRSa@DU&x)3s`?C>(`x*+_zsIPCv=~%co&HQVRG8q)egQCe#!<;rddBez5~bM zR^aYTlzf59#*_Z)$D>L>C-Ahn%)%f~nwA>HbQm%Tn_p>#%@yt2TFSjHmq*7<~_ zQAg~4U@*5_%&;0kNH=B;#ZPw$6KoA$-QFI$RDYhglNq*`WVz*?%hk*kY9>9E1}2m4 z(sywhmRD=AuXr4_csrQ}6EgNnayoX5NsM@f1l^^l2ICrPKzIXl7p77ur`1}@1j%Ik zJ2s_~bceh4UQ;N6X*L8A=2CmU@fgS3k&`R~(y<0IphJvG<96Pj7`>vCjbUDdM2FUF zZk^-Wa^GlMAad}lU0j)HukwaWL*qs9L-&OY+y?Yj)JBrdebP3o3w^QRS-X;=Yqnf3}vTYDh?{GMgA2Un4VQG2DAs!*Ii5p5s=Xs`q;) zRmm#{{uOs#o+wv35@17@)EnVHg@a@0P~_p7_;EN@B-)tuCW^k^hOYN|6YO{~7FM5A z(r!&R5@1C-2|EVX#WlxYgn7KD^BmA0CJrSv<_&)akwd94dc)| zH{NPqmDzALtb1QeZ*?dWAIW2i0Uh;B$6RKQqD(#YOUb_o^{U!rxG@u%sp&; zPX;G$Wn%|)ge30z@=@%8;j!4`^r}s(Sw*`$cTG6Um2SE#(Mdm4f4%-IvRHXf+H9(x zmr)jsjT1*7e&fhmqB%eiL59Sl&-p0k^iq;ZRyHFraChVi5Y0zN^Dd)VR+PGXF_GnZ z$t1)YhKKMWTtUKGamUhW^1&zTrkvRKLXcXy==+*(v(aVG{Ct-dZz3;~iKo@?4HeVZ zH4(iVzQo_o9l~>(dF7!^$Ywp|dKG4FBO-7A6Lx9YE4+)O(geSrN!t*!FJdlYIX^aS z`hq5}gj__Kw#c1^4=$87FW>m$Ut}k%8)j=UCZmU>Yq#kE>Vj;n7-W2;aIADQ!xBA| zqNNf2(9dI((7vz?%M5GPoMBvjq!hy1_<~PTLyk(Wt1@--qV}rzL{kd_g6F9Xw z=d3G;wFjP1cnMeSjyDO7*1f0(#?j;0gHullDI1fjqQ+;}hdHop6{P8KFK+qW{)%Ei&QOI`?XW6kUT-lA?4?w<7mM4p{}Mu~ zVISy~Xz)6+K@paf%VFx7uT%GiyR}5y-ZrEVRJwf1K7)D=imqrHVEep0F?Icvb*GGH zLe!KZJWt(~;GsdRm74sEy1yGiKP$)@y0ctk3MQ7A8TB*ZC2A3p^{G9ksU{pDCn`Du z&JC+#>U8^oxgqJkd3;_(GXsO6HS& zw|cpjzmoqksUkoB7cnj_iH_SZ_sc4}{9Ax`_NXolnIq|Iy2Y$_xi5wR5Cxn&^8M_E zXstGcKlk&$bD5)8PwaC$F8;u!rvZ5Fc_0%_Tysc)tMhKAMV zP7vEx=3&-HJZ>{J#hZ2t4wB3&w8Jdy9-ufbsdCKn&z4OHcEzE4Vrklo5YnrA<^Aw2 z7;7+LkTus}wjU&G+JcGNPG(Kco3FR0r&!K%-EwclMG^~3MoQZ~-nMkjI-!p@5)A86 z0_csjG!>=7gWlS-$YQ78&|PIqiqR*C`yR5y<%m1>9Xv7i>SUjv9k2DUv3pV*TT(~V z*6!c$k4z$MhrG&7MCaJ*mE+Lj^RYQP&wN!n8+g5smEGzh@m*oq7JPH)&+>hTau|Uj zRT^@ZD{$AD>Q3k9WFI-_yArVhJ`LH-_6wuI{Ira74gscz#A zxHwis=gsA{>`DOxJsF-e^1hbvRyo{~M>A6xR{-&12_8)p z-W{KP9JgKcF$GMb(jld%N}>nt%~kZ&<}~S3W7)w+0h2#qHZ#>=Z0(i0_%6@i^AN2$ zyWEEcNarNJc!CqM%qrL6qx14{0{qjNCREa*c{{OdhqDbi?cAb1?~ujWZBO#}@n7x$ zV>LR3Pp-pm=EaE>o%QBT8sD=HT>G63{3Jf{xVMvc;f0?!Tw@8$-uW4lMd5TLfC*$N zglpu`dqFyY@e}oY7;$5`VFhG-@x-d9O(HM1+tgbFi2S@U_-&Mj``TPl1CY&{Y-l=> zy_aiq$H^wtGhMHtY%aUMbqbJ2Qu2e@lh-dN%XFbc%}(m=vp5;pJt_LB+3P2u5bL_f z)<~()c86%pmfgvp%b)E@s?0(sLqV2TjbaSwL(9LjObqrxlN^s6k&80`*GK7R3i#Xz zCOT+W7o_!@ET6ZNXa_|Py@_pxUB(Ko?^iJ86@f$zFb~sjU}n||a|nLC42JI;7#P!K zgfU?D*k(Y9$46n^DM66gCg%=Won;(ka3?76wlLiVVz4hDkg3$Md<&BB*Xq8KqG&56jLz`!Hs=p*NaEs|vsX^ik3xoYs3{nZ4 zQrEBYoAgnxKXGR3but>O--Ev%5O*F#N0I9HoJk5y+j!`$+-@eJ3O>E$$QJJpx(-~1 zhS1WKxxhstM5PHsaSc>_zgOP82+=7>-hWq*_D;YI16CnEAH7j|EcKF zOxQZ0r8(A70tX&XFUv7qf6--uY%~7iJ8}d4R^XQ_zU^{3^Vqn>?MbP|y|CgC^QY)& z0`8hP3IDh=IW1hGU9Kf&L|>z57R%6)EpZW}4m+|X!fqvR z3{+=5Eh_;`NHiNj5hZF|pXbK!u#H8_!}gdudsktO-}yccLqK`KST(|a+iE9;@JeLb1a;&@Kfz%Es@&gV5t>i{V_ zyEt0pb9m?Y@JLwrTjwbn9B0!H;XT>O7YT%^1S{#vqZ6}goNMofPs*|`Q?%*lt0S!R zw30eE*~+O}27IqBhnMRqf5{yFtYwcqf>D07<>fVK^n2F)nm7+0voi_ZmR_CRCc7qy zzZ^cIUShqk^Pl$q(QCOQjrYtE4;f<=poxKX;SDBS#5>lvQJnd?`azeD?VX_okibr*3S&x3)ujygT~9;Ul4r>0+yh75n2n}#!IB=@+pkTh|zD#(syY)^uZK2{~pQeNUzk`1K}|xb*VhA z`75yFt(v#V>4v$aPtQ1sT$LEms*SP4ie_fnnii-Ka!22CJ9(C;Po9K^Ir!aMKgAo( zjNY#sA>~eB49OYS*zAA*$aN=4sgO2zrn-GF7rO(^@z*He8b0ymqacR*bYYXJox4RL zdm^7)mhVG@$xVUjLVRCa5u!vBC$9zZv|5oyvB{c*)R~F;lW^8_6kTh@xHqFBt;O)! 
z8bT^PuD2Me+Ir2@RBpsR<(@0`)$J$1mFc-dNI9X7;ye9EPk4g9k`-@~8`KhDsHchK zoiy!{d99FpO3!?)4pOsLY^_TcqP~r}K)`a+;DQf)7yUB1l?nSA{FlshH}mdHH?s9s z4!Jh(Eqtqo944g`%`gDH|k4YDi_ej85;nCrR2{T`rb74>B}+BqZ+k5+S0c z8bfPjm#Iw9xt+Ir)qiwp8F&-V*)~s3P7;G25xo*xN*Kk{c&fFony_Qmrjb9;k(HO% zS$`Mf+FN`ro~}WRM&xpoD@}a-%GJa1Fbns3V^r zg%0TGdG1~?NOK}Ru98-xb3wn>dPI|6USdSkSc`~?zqAi7{7PxzdJq&pibqS{v0*p5 zp(R>IU;t5$=-G;#R|J{Vcj3^D@9C9x;F`Bo&f{Qp2l1XNpGu2qb*R?NIAYSYoMgN1(Fyy0UpCdL>u}@HXx&m3Oz3n2%$K14=nVhuMl$Zo-DbV zKA6CM*yz}YN8c^F~n_#Ot2eK%hcSTmg=-mzo18kkT$K?+i+~F#X-t%*esqsu5LSWMX3`|TW z$v}5Rg^FCGW7eLu$m?)?+sxd{YZ#wSd6Xbj(!YY&<_;% zLi;8dk9<5xj|haH!(+pmei=2oc1WA-vF7BsPP?OxU=54H{-iaZZp|IU0JJ=zCfrv7 z$S5MjxA1UwPD$3-x?FIl__Pdf8W=^9CJda8%hDI+Pb;{yIiAc+4s14ekJ}NE4WW_9 zph>qC>7CWCB1em@W`e$=m4>n}2-oYFvQPOG0~%w0*Bw%OC*}|a6p>pVGBgT_bYh0^ zW)Qr#0_ERURdY`}FD1!w0^u}5C26Xs(!`~*E{6wAu#MI68Rr2RIl{Zl%NChWmXNhXc3j z`kJqW6K2ePh=liN0}alPozKmzC6p$;Q~`et;6cdGf`T(aOSz+2+2_yj6K&M)4p@Q= zk1MO6PjrUgh3yNUl>7yJ_{s1~M*1f|9pj035ELvSYYB(SRr=iFyKgw7T1lse<7T_& zSD=4k9Lg%WKJrN<*UQV*EsyUz4XKYmbVg1go_A6rkqZ zIDy$)?LEw=t;~9ApbF7rpa&to)ol~7UkWsPHM{vQx0W4 zD~@^sUg@>gv=psvGc(>3v3XZnIE?$lwc9pmv*1Zc;g*&gah}*I_bM`3%!|l%)wjaf z-f$w6;iYFo69qlAU+cxEP6E#(Uxj1ipSs}J0HcZBei9ZBjvk<%@@t`Gc9=gLF}B{4 zE|#XpMo$3H<0L^p!ehhWM-|chH-h0A3cfbR zAKa;2*pZr+5Ek@*K?XDf8#SAZn*-K^ysXRY5sra^xr24WqYgJs!;g`r`JcI}_?@HF z0>r<>1j_EUW**R(!25S|VM>Zy>&9CWb|c=h`@~8QeKwIc?ft3eIzc|ow}9X_(kXP- zo(I{Lwv*Ex9_Un!^Psns*rGEO2#anc^r<7Pd+Zv?MBctKxqzi=K(#L%O!v1hEGpi~y>~!?4 zUlwde$Ht19X$x4~D0z9cK#{k8_Q%Cg5Yk!sY5b3?jpzg&6$v*d34L8PXj!p)cc5{v z`o&4YEfx>7KJ-y<_q0lgNT@)Xd25W zXlrW*(t2(m3VDc0)Zi`wYrC zHN8V`UznMq5}ob|@~fPOovli^jCIrX$bHi7j~+(C{O~rQtcNGrJ%jKZM*oL+jwcUa z*h+p8pVmLu%8i+-FtX|?vL7`2+Qlm6A9eD*mrBHp_m;B8YE0|&zyaCB)54Xgq-|(y zcjZ>=k2U)ZKJjrVC%HE_R?{T{oZ%8Fp?sObM!W(2YCvyN-;LlzZsBL9zQudQ2t9BE zwoqpvmBO!%dvS9(vUA=qYQDQJG*fplfLTk#ZEl@dwkb=bUAY9%(rq`X_Lz&7aI4+j zn%gS=Y5)DrHJ(q0)yVvfDYNE>zwRzCVB7>+h@QX=KP4bZ}1&eH|Lk3a@nno2`A`k!TsoJJH~G zq_6r=vYNzKdtaeu3%$^}m^m5Y=SfqFmcTiXa~N|x6;zTnHC`z&Dcqc`Nm~;vBZ`0O z=ae4b-aO_PU?_^WJTq)he&Ba64#@${A7;QmPY>Sm+-N-S$rc=xHfuPLj_aJSRM~ws zd~fb=FlOcPJ&HOo0VfV_%v9wzV)QH6IcQBeB3sIJ>mIQ01&{% z#eMMrY$qY354pR!U@g|*QctfoK}n5xY{qh9oAD`Gc}gwX6)!8H8LV7xHc(3TDM#&k z{v+b@A=&vkfJ|nEX!TA) zxxU|q@XlJleB%2)o0-UAlE2%tL}SQO2YuEu_+xn8zVPRxxmn=R42WJ^blL_rNE@^c z(>j1aQ$z9`5Afxsnmv&w?WqEFRqr3pO?Wi&} z?GOx&37#i-uYrlRm+vsNru!^46gqn>u`VGlu3PSczQdO*?aSzOXdpd5bMMw-m}@Li zbm&;^a7roedYxT=p!*7$O(D{glj>na+aov*7-+uScSboeH2jLw@jT3PA!DtHleu>>Lk!oqY6}WctiJKOR75#t2JuODo0Y#2_{@w%?ry6;jZ#bjN#2x_BjtH@N zG|u2uota6!v4(hre3X#7U(WBOc%CvkrR)m&q;S#<4Xv?nDugVM5RBO1)`klw2Z-K` z5FwO_ib9{Pv+p;$(sp6$ug!W?ZqLqx6p`gHWgTgc->Vq=hU=~l7n$5;n3h>V<5%L| zo1{z`YskUKyaHu8N{u-RRk|EyPj|N&GUWMS0R?!*hV1SH zLq~t-m6kT9Og8X=lm|eoULqvGfcsRW+ffes=U{I5dzm_*Hv#hNc0+ei2E~w`ssuN5 z;m$WcJ=fHO;29+7uKJe?h>xXDDCJ5yFxOj?L$~D9$9Gg9mr0E0KL)AcYN+b9ZGejZf_>E@W)pK|637AsPynQd0wk=cI?x2eZ*b zdZj|LOka3`e-&&gikgaB+(!Wjt4FDT;G+>krSfpwk(d>X&{h32p35mP(}k{4+IQ3@ z)?b-higy$WEbEYo5WIk}Z+}+*61tovvTF1r_m+V3f>s{6QTM1&Z?*D2ipW0DU$N%N z%dX;`!{;8=ie>Pz=P^Y8$|V8E=m*W5|1GimUsBHhtIqTPt;VDUF@r#=XaBz-j+Vrx zX1Q1hsN%giTnvWfNQwJxkhDolldOL%1Q1);radyOwNa?CWKR$^k(`;IH@f&crtz%I z%xHHvr}C;UBvK>Sl-VT$O$y!gQTD|}K?QWtmkkN~a}2q_JCxvhw23<@_32yXm1C4WM} z@mE2rlcJLU*HHSxP*V@?4hap7ba~htD+cISa)%KyL1zl(2RdOootbIZ*#4`e8j;i_H z5lalbyu6$XUrJf*CyEABILLt?XGrp&K6U8?_qM3~t31smdczV$hI4^X>6t{vJ)#AH zKy^-UQc8m{f#8#k#N}Xx`+2|*R8N18zaX-y$=NmMxum}$n9^ZvEI2vkKWl(ub4Uj@ zjQ?%A$MpYGDoQr?zZ=PjS0+e5PC0e$itM?-3Yg7Wh6r0?U_yHFy&{|v-~*?%tk;|hWQ&^4rIyRyF#GAmSS-tAm>)O~ZN)8Rl< 
zq?NanIl(f4+Ok|--3gudI%Mvlrz0=PxAW}upfg<}Hd<|79vdTW90>>v4BVT-S$*;8 zro!l%iSdmBo2L6;FUKte?MH@(h8i}$!+l({(Jgp#G2e(_{#~@XENWo3K*h)yhmvO> zzd8~=BNbx6p-T^&$G{9`<11-=Imwr{htyw@gU*{cTp8p@*Nt+S*YSNOBwbRSU z&nO!ff9K4ju3pLql8+>i6xAHj@{$9xjD{;=N&W!JvH)k zI=B(>n@}Z=zPB+pYc)%Gv@iJ6+2F_979G8B3Z_;FAm(O^!*s8(Ei5%8q2uEO z#?n%%J;AL6GHrH?cElqc8F=i$>abAH{(*mOAU4bHSV5}!)f5zN^AHwm(@AR&cWM zZF1Y;oI%{1+Vk8~6_q#{1e+L7sM5M!9Ln5Y&-<~dB`HoAH~YNV`bEA~1BQwc&`T5C z-=lli`G^#$CK~tM%z4nU5n{d1^2~8Lp0E~Y(~Q!f5we8DnbeM9S4=OA#cR6P(cj*D zC!G`E`6ZfO0%|lhX!n$HgO5Zev8%64Y(lqx7l#`R4vl6VDjVdz_#t{gN9#Rwp$KM*SxdNY<4U?8I-TIX7>J=6 z5s_H=I(x1#PlT$GP`Ssravl3U+j2(ts~gw<-Q45g{Tc%OO4H!vWOCgS4U}>qFDEBw zL)LA?dn6N?)vQ7Uc+3@_8};4XPZ%O@X@&7o3h+|2)RfQ_^=nv$(IX%i|o1u%-?{rT~RbWgD2 zuB_LQFaGX=D4B`3n5q_+d}j1+slnQXP4bm`(q%rgkLoQZ;^V|TgAFq{~O|$ zerTrC*=8kA+p~UXpk^z$u8utQu6d@->r)HDpZiRw+<;Uf@5c`=`W*W4mBO)NBJA@A zwP7WThf%Ll#ZPtR)-lwjBDV|8$TGt{NB4snA~j7zR)o^9f3wF;gSRk$m3;|JJVFrO zcy)O;ePuU(>+5%Yta5qX`b-c_C}0Hln=PIX5t_?8#Ysv>wvz7WOd<)V=bq;-f}5`r zrNJn+1T|~58BM~siJ-fAu| zk?AQQ01{)t00My}&VBaQHFS?C`mt|b4Qb@@Ytm9pI%`-gW{T(AZYRiLRy&VXKF|D4 zONa7+>_oY*+sFfAsaFn8zHk}X^_^Uw(XGM}@dPvzRtwv!gUM1V9Jwc8AIu5{CW7!VF6vG-;^35@jiJ2(^DG)-#%3wq-pnh7bXv&$_7S=)-%ZFq$jwjj9Gl*?T%Oidt?t43AU4p{v!L!^k3F$` z{k(v!MVoMx*+-q_TtubbO8#7yt2#6}x+&;S2ZBWD8FCV2j3`fc$DdrCY$f{En6FxD zWJ@o$ePT@zQXz6boFPJm4MjKBrS*T(qv6T?H5m)HvtjFzY~T|rKGW()!~WVffS25C zi9p9Wcl+ZjDzie%;$av5yNbSuzu8!jbgXw{cNKZXQ9Xgz&XQ58;mu0B*Lbs@)Ne7% z6rG*GEFsKL{3rE{7EfEe;r9hfcgO9sGm6!B1vG-1P)8{y^D|Bh{0`4m!E<%y;h%=cMy9Nt# z#7KD19bl<>#B}ijnACi@_m;0N>LOmL&3U$oc1N=r7d|F-dm?pgU~qr2R(2p;e-Pn_ zatyuLOe;5tj-=*o_jxmmN5IANLLvVF2Se9#tD1csyN_U6QO%@V9M#Va%ge%`2zTK=MMuRux!l!_X&H6w;$PY z?lVo=-vN#2ZEbCe+GzmO5nVJ)I0O`Mca&(w@3n*|dwb@gaStxv&yw*1hQ(7eurIN) z9A^5NrMc-z3*{9D@+jm5V6qE#Xtqr%J?uAwbk_XuKPyoKKXC6(RffhL%maThm+}{g zWS~-3r^~ZIGDL$Tsz{=1Sv@e#-90OwV@cxjxN`Pz4<5dwC;sGXy@sB5r<{Uurzsk&MxvK81v4F|(e0&6c z)oSU$&_5AoH4Kt{qM2y57^sg_&EfoMtncle{w)uGd;CWg!8455@5cjj)H^RfmS0Q(?PNBiMXRO!TNhUWrnJf;ncN+>|2>}CI)(Wi* zNxesRq4-o45;C+ffvRI057I@2SnB0!*Se+=Vmb-hG(M9=?31%;t|8oKtiOqwBq%3K z#(H8h>MA4*R8vRLhctsu@2KrgJ3mWODv1cv5*{ zhWuTZp;If{Lq~}LPQ^Rn{!*quTqZMF0vaG*bSk$}>PUuw=c&sMH^1iOv)H44+q{?U zRfesmFu7^Jmaa)5%aJk*GPQtk3EAL7ohDFoCse<3*uM}51Lt{Dd@g}g<<5~_m6K|+ z5q8YxcGIk%DiiQ^ic?7G)SC3eMm zp7;ASdvDpF?TS_bT#tl7OZJ2pf`n;lrH%cB1AfjEMQS|gf9!ak-F_c$@w*$)e1wGs z6^L+Q!2Ei%DNA5~r-4;98F|<`MgiZX@QRuu zd|(>ydaXT0X1%Ievl}6YUa?}_{w$lOfkBhRvQkL;iT073RD82DEJMth@oEWlFOg;( z9f6xBaZNGdklu_fxj4O}^dcWtc8w}kOR`GOg^yW>2uwoG?<4RGY#*JJkp>sPN}7AF zRH8vMdR8R#dHxlww|H-REl)O&UEA25+Az0!&mf*+yRy>0pHQ}iOBNc~TVzphw6?YT z!O#iTEq54NMfwmU9e8!}W@9i>6BSpE7smV+ofjK;WxULW`&T^OTj32xQ~o32pMjt) zjft7QvyRt|;J*cX3X5Chlqy6_Nq^k71E_dnk9pq3F``wd~elcuyZ zaU&RI-()xs?BuZ1Ev2?&Eu31 zz`{Cfxe7xOs=DEyicYDy8-0q+VqNAyphk``kplKvXdP&wd*@%IqI5;|7x%ZlrpEv3 zkTC%;EiY^N^G^x!$!l+KpLc6GSU9rvy0E&gbeTrygzBD;&PKt(mgSUfzar*_xFfoyyJ6*)_m8=RB@lvroIZWske_${#iW3 zc)p ^QXBQRGf@HVn!)W~Wodzmf03MHETC;I(^%1?Uot*;S*Z@(HsMK8Z0UYzbu z631rME4Id&tnO@{DOzSj_f0zk6yE*-$Ax)?goO@$5Nt$vGp})QZ+8GYngZ6bF>Wz) z_sl;R{6qfyOQe{J8)JL^PCu?)aLn=gP@wsP5}?i8*_0C=76PPraemnD%TmN-{iV}( zLi&TO$x6mEUdF};L-P*qhjqm}nYss8`=$rAleK^?Dh8rVX&BdwxadMg2YB2^*+z)S zALfaLEER?yf5Z2GRojABv!%iBj5q3R$HFK_J0Q=Q#JZ}V-U)oTBzZraBJi>e2Mfyz z6)Ei&8!$74AJBVMVtoe%$0W8oN6kJlX=F0ZxhDh#ZDV|)JZ(k>s370UW(qUZSWHqr z3J)>3zz7Noy0qF^wmZhW@aoE_gMYHK!l5YaRf&`Pix2O6lx8VWrO$E-%7YD)wi?&s z<`SY#OP^tjKKW{OOutGW%#B<8*uUQ*M}Gc9;bt6ShJmOrKgPd%NZp=YoZwWBL7{Yu zKIh$+dC%2nuYBt$9r>;2G+d9LqlzO>ywVQai(@)Ra=9uW=tP}T?>b82Y+%<|u;&i^h$manej(7Zcer#Zhh|6hZZ}@bw{6FjsU*%CtPEJIxS)dUbpEJ{6 
z5)KQ0BN+84%6CEh{)sbiC;eC@juCS9ak^##*BGyRTO&h~92AffkHSNi^Km(LR|-<3 zry-!-R(C`wid9}OX_b1F2Mi9t3|kqKeIV^tvykxLqK}~$dWB8#Dt#;=pB06Y0TA+c zQ5+gxhjUhn)p-v1LJ$U^+iS%#Edo-9;GR2TP;O9gxYcq8W|QZgC7MV6U z%syiz1Z}~3QL{=)%4o4WMW+xJFS_O*CvhkY%a#cLro2roGVidq41rLgtfv3>PUp%> zKC8IbJDzBf*9irazS#n&ewdP7=Uc(JLl=Hk#FPz z;5CE46h||XGGa5s(rq^COGF!0B(S}8J#8oXXmr|4I_-@MjP_XiW_r2cBv-@=uj8^j z-y3^>u1@S_TUco5YF}is-RYu!x}Yn)X_fJDzCC8h-Ip=ifEbSCfz?ve#RTc*I)gKC zIid2Btd*i)xThn(#R%?efEi`rLpwd4sDnI=epcWLUZab{{*nR`7MsQI+$~SjW$@F6 zXR6$UI8E4||HGyEw{TJgoC2Re)7H$vnB2LhOLmLjB?59p7#1bH61t!G;PVgPqb#{- z?AobaGTq{EvgmQ)c%859H%J|PY&|X<=VdA&Xh$z#;58_=-Pb*+FqxP9mYc1a$tr1Z zV<>sN#p9z!x-j7tK1IaYQ{5E(AY7#G4QR)b+pwNwZ@Nk)Up}j6Kp7lKUbKTeVsr%#q zAbG8(D#k=Yf@p(qNDRiFJU8u^Y;-@NWcCyIqDXY$6JPXrNRcar-r)o7yYK4vX=R`Z;=VW#SDZA#icWx}aT!m% zGUgqBSFOkf9O&987~XovsG_cWIx%Z9;lwY-t{Wdxd7a-EOfg#bdW_$NY&KHyRTzTb z70E+WQ=`7?aXW%j@4P3KptuuZR?Oym$7?<1mp9MHoniB}37}e?jXo#IPG)lzT*)uY z>5{FpnA*u4wrP{ z6fZE%b+m0yrr!5nIHlKA=+$}3b~s111?t;Sol$>^$)N@5C=m;eoDjBX27dmI6K z>~fPnn(%riu3eETXds>_(B}7f)L3_;PwIops&$G3Fx7*$ zIhT)a2!{pmAg7nQzJ&>)!bsMOY_lHF5Kf|M*1EHKZ$7^_S#H{0Ql2N<@}i%`X0idD z)az8dv4EkxYCa-WRVeS zZcdVZHT~dqI-yT>W0nSF=ed`s+IPzql#9xg@(F$y|ve%Ra( zSD&+JR#9P{PpP()=svYNI7!I(T}A-ELBWoz1>;*~3!k1ByH{$b&vjs{giL0amQAJ_ zl$}_}JDYX(Y$#|i$F#a%*$(yBqywBjQf%es6?j9)G|E4e?cR@0@mN}0dxrFY+_u*F z24$>ho=N5Dx_QgD2O2+(<8R}Gn2-8rJvbNP%=XPP5yat-R$&?r?vE=QAe5DWFS;Y~ zjxy~}MAxH&9CNlcO8&NgmC%m`*tXFDjLVu@6S3OElI47FqYh;2)$gN!YTy1HiaN{2 z(1cOMwx(_vLo&UOC(!NMn>_Z|xs7e{4_>LcXgUdScwF_*;z@Z8MSsQn6tT2tH;AX# zsQKi#Q5Q#f=T3@Y%uSIo_U{h~kxu&m{7D6Rb1Hq+a}Sf?L{*;y>Qi!}7F zY2)!y_^edN!2|?WdFx>1)$PqI^qR38HNA?D!-^edRfUs-Ia;Q)4`Q-BFMbxNiCw(@0NU=lpc^b?-TT_q&ZSPp(Ichq;_&v`HYPt~Upf*N= z+j7GJV(b79)wY_w^^~WyK^qSW>$qvM4Ox+wckc?viKi7j=5%d5w2;nKdwg~Y8s=;Z z=p3GB@yEEH0-Yaun6a@OYO1#%9@l|}PgcWp;G!=Tjze;N-ZWIrq+LK=wKqXOK3`}; zRyJ0mQQbNd_KM_O-&S;Y>qG?-;;mOA2@1iUm!vM=CP!%eG6874^VAIsA!dzkR|SEwtQ@L-q^pBy3@+Gwd8>6}@SCDYeoY2m8-< zkKrz1HQ4|fPMX0&jcw~rbmT|=A|mB;f;%^mC8+ph#gZALw@>?T$3#Wvr4qwlkL(v$rv>LoW{y2C74gB}%? 
z&++}Yxv!qKRlbU9Gt^Q1`0y&l=AA^?hX{qqqa4c(#LOBj?BG~ynpE4N14g+lx0tS9 zX)KJPL~MR<^THi{;od0uIr5$%c%8lT8phZHv`x=_`YlXL>py3B(*aS}^~ykoQt?yy zwbx~vY{lV6XFs1DdZbWk(x?@ab3+@nJZRnDn{fprf~Hz4%BS zl159&I;TJTy#e#VgX?{B|INIo{l0ET-Z7syq%s{4b7?wu+7Z}vlFg0+Xg(ih36zRQ zy=HNF-G?iJl|WQpjg2s!PdkK!n@MgRQ)~)SF;#=}*9FD%qif+g@d1O1d_%Y>o~;s1i1U(J~Cd8=zz& zGshAbzg}=tO*`$|taWDeGOnHAe)BKolx-J}toaiVB=bv2OJ`!iE`F=VqQ{1DoVc-9 zxXV6+QuK7*s}KoQ;8b}%^{gBJYWQF39(Y4D&vU-4TyL$@2n!AkPXq`VT&}-NoT!o_ z+6{2yCtJ**3H(hK6?L%K+K?26?*{fBZ;ewp50C!z5g zPIP@!>E`m8!%P)$z=J3eTmL!k4mgmoJ?g4t%JN*<_h!{QMeR=&jg-ZgG3?mS;B@LC zj!}B^E*a!D8H^XRF+dI0&a)D^-FS+1YmGY`jYn=Vi2C(s5`Bp+M3=j*gw)kZiMS2^+_;VQizP2~Lbl5%$d3r5YwhF8 zOpY{J^_u9O^|d0^A551FzsZ&)dFPW8kG8il-2Ge0etTUUlj;)C{BjXMf+Z2pI%&6h zC65cWSX=xPp@ZZpvfuZsOiF;F(u!Hf8=A3um4h!D@5WT+k6T`TR z+(|LErQzXu829YG1OS;=J^v%q35}jWXu!}hN$VSh_oz*aJxi;rtF65fX@kxT=I1ER z*QNmrS~)>A%9_R{kiCMW98=;{oEckFl_ z=0-7T=kk-jxNY4o`3!|zOsyO}rBhc~Hrh=_>TPTw%x-X6JJm#y%X^$xpxFL%91&xLQep;peh;q>?a zj-r#rAo7QWJDvO*-O()MssUH@ zE<41t>7t7M5do_o5NU+HzdPQ*cYVj>S~oup^l>Up!D#Yyp3}U$rl0CxC;g=U*4S?a zW-~-R)@1{wOzwgI5=e~Fc5UoQNffc5!KR#{Ri4tApPM;qW-WqBvPIl`U@v0?D^ z@>(g^zh&hY6kI$RpZ9FZzG3C;xv4*av_X8>@NDj(K#jy2j^IjRz-@T8v3ol(A7;Bd zV}5>h1u`VV)aySa_3BiYm-8(_NZ7UcSz6a!EiW-i=)cUSPR4WwrXxK8tEY|ajI2PV z`abz<_kV7XTioCG;&EMw&yj*dER!@PsPn&XsW{VHAf>vhpTJAMd4^BDcf&e2h_^`(@F)ycT-55GbU2Vvuo{&+UV z?`kFHr)pki#X>nvUIkpBK-Lc-4}d?i|U?ryHGGhKC+FuT1uk*#<* zqE?mp%qz|`dW_et9$3!)#5NsB_uMu4r{#$a_yhzYA#h1)X~W^J_r#(V)xqZpzXOj< zIZY0kXcIHEjUnhkID75$J{*#gk~L1S<)jmsq~%oU<+q2;|8!h*gosyFm{Asw+9my( zNv#PlnnB^??4^BXoRIMK1r9OiyoXChzn@~34est48`(c!ZS@I4ztZ6lO;;wdJ99C( z(9WR`;HYV!OU@}A6Boa~e{jGHL0a9bS>V!5r(`HnD=dZ!&qXWU+nx7Orw;o3xn+Eu z%-6SU4+k6DFCjrD^{O`QNaVb5@nbB;*_X!!Vz@pUL&C{~h~@7R8U5mIAL|e2y+{-^;qgxA9P#|TxE_V1{Bpd% zpHi(vo4Z>n?kJq%`Qro*z2K<<-%mT!d_4y_(#M_WcK&v2y&(nnAB4iwg!tqyK4WbJ z>U~gb*_qxA4rj>avsMo@xnp%}p}z$!N~r}rULBS6m8&}O>GoVC$#Az(_HU2YYrPdg zEB4#Ml17heqw?A+bCqtxlkd&1Zw2*q|MRS66aRk60k4@_FAO0U%c^UWj{5F2E%gdB zY;QilO8u;C*CNCBpu#_dC9uWwaA{Q7Vq;TI_@9ySbbyJMmWJ=ovrup`5#}mg)FwUe zfFe!D_*Div5qs(jc+{+|t!+z#+w(6B5g5;!ZyAd9nx9=v2J8o7nC`>E(SLn;iZ6QA zz{PFS&76q{78_oJWGklOys5TEihZU-2_MKP+x2=}*KRrxg&`Fd|MGlR5#Sk$L~UA!MN0Ed2ZE+-IHW zyW2~HB;uB0-5cmZS4!nd!@TD-x$a!LvOmhoQee5+R3XL3F;5~OGYe|GfklvH{!{G+ zFc8&FEDL~)W8{ZI{!?FNV+cj(bPru#&wW=$9WU06|8;}Wm;5ZuEz-@6`>5im$$t@#OsNO{2b65EoF<)He%Gxc? zn9I8CZZi`wznw|;!Aq(hxQ;0uyu*zbV&x&2(@j@5@9#dco@~Hw75A<2zLAS{b&*ZM zW`3I+{_NjrjP6~9%nzoJ;F@^HAG$YIj=NLyXtz9x&po&q6p2xMd?01NRN!3%ThKN_ z9w_HIdC?hPv0i6i%6PV@qY1~mn#6~{Y0b6&{Jz(IKkS0q~UHg+x$Uc({yT!%0abVU!bIQd``NUj%hlO>Zy-? z858~N{YR3=OsWk$$uDf6@eDT8@XFB94xyITr%7lQDJdc$KyQ7;74M0GWx(SK<9+0i zv_&ZsWu~B%JaXTLv}~@>tsWKV17#>HEB@I|L$G%ipUKe#(YS8KpDZc%Uo0pp_Lo+d zComncvzwb>Ujwpd@%mzT|Ky`D%`F>1ThYgZAf0_MOx{#V!7` zarb=Ds$`(vpa@EUQvf3So%W6Z#~jM+ow>LvwTN`@e1mzBD>8NRknsDDL;-<8G>5vm zn56H#8wEeyapMwkq{bZmxycaUv{ZTUPraq*LWr%FL}UR+5ni#O&E`WQzGeB)Rd8#b zFq+yeH1IVp68IdU#(s33-7K-61oQ| zx5tW#inb0g{XaKX-)-$3qaCKJ8t-oN8JZ6hd7WOrqhyJ{dphRyw!R@oO-o$ZKt6PP?TW-j_l@B%71M`@1kd*i!4HJhsG7wv!wV8 z3KF*ukR6)}4twx$DI*~X>|N1P7-il!?gY=?{i=gs;Nju1vy&PMagY zqUnX+k{no0517mVl86buD0Y890VagpprAcpyUZT%qD$+)WH;#RaG;^PCJ`oj#H3!z zl$sU;t7itob_256N&fjjOnwW}#zlcjKJVU*D*AvNsYlru7IZS6vjbdIgD zB8eCVg~6XE&;r#PtSwbUYr_8E!2>|5aF_@E4?7U?-B;izO%(CqU1L`duP`459wEv| za?yY}kvhY*-lPJx63S?X(NUe)d^kwI2g4(w{c2Cd+zpCmH3Q{(oS^aGz3u68*l`G( zJQ;*7KD{!V>3}fwon|KU$DBI~of`ap5vW29zKo2_%77j*z~`RA>JU(oAD>>6U%|U? 
zb-_lDT$U`WRD!1^`Lk^M_4_Hj$fTT$wJ-LkEZ)DVu|*H@r1FA{2XiQAV?I+Ye74mJ z-0*%->nHG&I5sWx$#Qt~q3vT|i-GCG#rd!DYG1sn#HGe0#w-Qs-sptjnNZ2?EwwqA zp}zcQLp;Bpu2=o9K8t`}_kTb^|JO~~KdlA0+y6`CJ3Xa-LI?I4Ep3qxGe_7})LEa8 z7_fE~YW@4FdLu67cHj+vT}@3(`!es#mwrWBaS7u3nUJ=&b`bVLI5EvwX^k*Rfj_b5 zL7?niK#Xi`pqryV+NWViTVuEWz|yisxhjQ6P9E%Ydr4tn@YMCL@e-I!Dt7;E|ITHi zuTZx3+Wy}(_ZyeDt8w%+!sI^eI^pzTgoZblCrs}Oz90Z^J#i{3y^(J!sN*H`Vpsh) zZz^2P>E!ba52bKP@xphs#*OA?n88}5UhBKUBKebFz3X*b2(VTVv?mT5gJf%-@D*wy z>|r(kho{-3Cw5E(9cnwAVfMc=-dRBTpD5tK0KQ;QuIy-vf}4A>PW@-i%8yUgif7hm zZ8DuKmKJws>xsF|?w#&T1}-ezty%p*l)MF#5EpyU5R(hoKTNB4>;&vyxUGv{T3Vx1 z$zemS2cIAQoO13f+}Ei*xRd23_&C8tf#8NB6l9$qZJHK_}9Ni#4(W)uQhgA}_YLb+<8T5K1p5rJ&HW0Pu0EZkbCr z2Z$bw-*s|HN45d{6+uJ3YhYvrpFDu{WDhEXH1;p(_kRyM3ghWmiU0ec5$@F|^z6}e zzBxIxG_8{%JLZ}03xHS0f}Y4)Mgs;lCD-+;>7nz}lTzh(jOpAgD8aGP8D`>_QaRwN@+YdJsk5?fAJVWp?1 zgU37w^`^W!I6Ld!nXWE#Juoa(FMH4#N+c6ss8OVp@bB-Xklh_FB#!uRD+%c#5z{9+ zS;zj8!%f8F({A>r@V~r!2+@NOuxUQLID(Jw1skny#t@I9m#H!bQ_Vnq~n$_LNdGE9cpb*Yh8K-2@ zD%OtW*D3QPS?j+*j(esi?vDd`nb)h~c#J^L-9CaI z;y+nG8yROJN;5>AqR~90x9*P1Io?!Gk=feW4PDqERvdoLlE`AK2k(a!PI3`@;*CFF z!3<^fu*E`W851{qUSY?_EkRNv%}%ar6S@bq6`BMPBs{q4v?!I zp{sY)JUp=<-UEy~1ZkMF#7i zILY(H;m;X4a+Mzee9>t%kClqwY;yDg=!nGP6+D*vs=*Ab^hH3P5MY7p^@!3Q0(95t zWGH8=PTgNfO6XGUT_N0m@Q}La=6?2}Reu?Le59T4zM4<;lg})Jhnbv$0^e)?4dPc1 zP&BDudwOv>bb+L!N+j$&dTQb(0oohI`4{a zJ`#r*7DGTOl6x7nyWs8ZeGVV^TH^&L)U3Sa+7!@jv=8h+T{v$0W6~pcC%245=M7xy zhGQXqr$pSAmb-_0ic|ej!Gm%;p$AJ@2v_htS0ibR%F_ zWtPWh=~+gAZjb!@{cSHtWWy;$@MwduKBwVsKq|~lC0+E4`v|XQPV;>UPYpZe^?T1l z%w}>0UP?%C+MhqhB;mgYCZsqwyJt&(ks8_c!8-S=F*ZypL1Hy_y$&*)#t54x3)Z1C ztHJtO)4e7p<(~xDyYoR={wNE*8!A7wle2{%fy#dPs9kw3h0jL6ZGsC?JQUM~tw|I2 zxC{)MkX?p+gE&UuQeQZ&uKplq+&ybTyk6gGpEW118?-w zKBz$B`S}~;Gtw!}6hpZh7#e4y48_*g>l0tek}J>e^V!TiX1Uo-6Y5}ziuaHT!Ad(S zW-6g3*tvgu3;espU6G{LRdnJ*OPxa%Zt})`$va;?_n1dYv};-0Zq|orm)_2=UwvRw zEhK)MBF$dbcVn0)$WJ#aU{!*Ki|aCq{8l;rKKMvFp7r-N2D5_Kf>%W7z0;L490crd zkIN3ch-dajzQrVRIdRX}%!zIMT$S3dudD0%SueM==+^6|2A)U>v8M)j86P$eWQ4&I z*xNek*_!wwB;&r1-g3;l)b-k^H)Dqpv44VA6M<8ZA{2G%X}vxlgMLJjXOEhACdRYr!<;t_q;-@pRK&#GemtxLiCx<`LrZ7zJu z>(g9vqMeFiRI*k$x%0=>fcK^Ov!(D3zK#fu)9p^s^YV8rE)Jo2CerBjaydgR%7`vR5>w{nbZ6H8 z)XJwNILrSFn+J9U5F&e_VoHx$PA;dJVzF%l>>*A%ECU#l4bKj%_zprt2_5w$HczpR?btKI57o{be0}#%PsPFE zD^7ns`E-gCs`k2qYOsTbW#8RK?>3nX&(PSRcs~zHTX!j#YSDqj^eK?x5?-EdL{$Dz zv*5W@Lm2a;gELEyUSwPEB#)X+R&aqaDQt;e-rzcy*|q*dk?S(eUG899Q=oZ`bR-G% z0Wt8GYv{fIPc{NRO`#_;?GoGO)&wgk&fw)8hs0n@uPR z(W>)Y1zp)#;W4_y!-gy+*t(!+qq8={)F^j<@)?JM?m?YLi759IV1*k>Uby+!~p#&_Ew@5+B z1CIf}^Bbh3N3(f2;|3!IVFsn8iAZBph4)iTEq0w)(DdL%vxrE|^^98V|2EajvA)4k zhR>p$7c3^IUw3DqJ#zQlkKPZqovB@Zntj)dA{2t5Ias1okD@fdn4v{vAc#W_;_q%y zuKS*!7nVMk9Pncv{s2t88L~Gc9(xiz*Vkm0vsGB)N^X7IGvvaK#5X8doki0w;GS?S zF5(iO9T{U13y;(xHc8pn6H|F^&HY_|^J@rT@(&}kdchzD&UbmfOUuchIrE=d3p8&O z$ORl9>j+l0Ck;F1_%ufr?-aC{lM!u*9xMfxGi zL&LQ~J-gdBWQ%7Q1cR1|7aArY8!gJ~w4uZc71oOzRHLbI4Q}FvXod*Mq$Z~=Uf}-g8*grPt;P?0UkHv;y)cqO% zp{A$30jx;AIzrVs3P0A)uSy_`@GX`TybO(>D@sXJx;b4b=N=m`3pJ|@`G^WT+`AI&#jof_@U-6*Si z-_De~aGU}o_h~mJe~=$9I8O;6^R211N4cIA-qVMfZy>h&Us9*LyHQPhEBo%eqDEl! 
z4!vuAT2VAo_Sb?If@6i(s1~Z%fxEMUA%Q>xzXqh_`LP+RvsGTG*c75P7xpO`%@80* z^9AC<9yf&DqWuFT17mEoKTW%_MG55zFuw(YNQYpL+a-D*{r%D?Zl@MiMQpRu8KGV- zQPHL`E0m-z5Q_(7Z&stl4@kr1hD{oiND&T)>2F{4LBvr zZ63&TlYT!(^Wh%B(}Rs0sj@UKf3hmFMVY7R+)|HX*;F2Z7^c|6y1G37#zqt2xR+5w zr;OLaQUMED?Li=5!8w*$2M;LH+J3IGESi&ke6$`IcH1IK`)1r`A^_xbWCF#XA^QID z@k`*S;j}m{;YKTU-*HDw-aue3etB$d$VSQU$}gRS-Qw%t>+>TuEL#Zpk_+CKe-7SL zwG!0ttHW}IrzO8i8IzK<1Is{iWSn<8T9B@jG=>lURSn2?f%Km}{uFY14a_#z#vLb$ z>&gU55%cO#wf z)QWht>Fw%xU-^>|is>r-qdY7oa7M_VTstl=bdOP+0|HW#Dx2xh6u#mdz$tVwvItvyIx+9b&vnR^UasA$(=0&r4A=uab^4Po|t6hTiX0SAH zFdo-ra>VcpBzdd*HeauKIIfRinGU2S)XuQ7NUdE5hLzeaf57eXM?G%+W#E6{wQTdI zK>xl4dqrpvO)YL2X!|6CsI=sb6fk($1}X-Jr}`ZlEC77`|#mvV3*VgtK6#*dd%zhOB4!Fa-Y{z)hs1Uh^iD}1_p2k(dQH&U~|rBedC z?B})@I)c9)el6TCErQNUb#mb6TWX+NC2YPC#8)8oVL5>SYxmo#ZUY-rMkH)6pUtN} zLIQ$@yM-ic)sF?-!iydgRS27$_Xv1z8{KdEa17 zT~FLM$THw|I)rv&n0%RB3(ThzMc`BqKKc2vFB(ZliiI&mAjYHyouf1+qKg1RsarRR zv&y=Yxz34$7$78)-JYvOvhp^IY`)cg1Zh5BmCF!PQ@rLj;+9Kz7DDGN3lD;Y2XeoKXj6{yOhrLmeR7_B^yhVjj5KYqXHuEqAjn zEq2+PWNBUONnr60C;1arE#`l=6&5BiJ7e`Gb=ZAi)!D2Q9PEb-&_DKUeh=QTmMU?U zmv22he&4kP%8wO6Kg(mU^~bfRLcXd`IT#N_g8Tv)wJ(41z7b z@R?m3GCqkNcdWqzS&Sq&1s*BQ?pErZwP`|!%8YC>-W4uY6zerOUfk!nSC^(+YySjWdLIrm#RZ2> zWaoQm@-wvvqrf>qkBf@9o9D(&% zRXV^}5Ob^4c)7Pc5!JpvQp^Tu>_E<828*k>8pK?G@VLoL!q4CCw3h^vLiiqVt;H@F zPzbt6oasWI%Jip>#mc?5EIwAr8+dsXUxfRZ_|sw@w$J9M&RX$JGZ}ZNmS$6}vqf39 z)L^p>Xnv)gl1duyTAoS<$OH>{s=E?$ z)G}=(j&OfEJA>M)dhREICLC|5A<)?QBtH1Yb1YQc^XG!jy?UkF2w(({&~pGxm;nE_ z27cbsj**R_`V!VkvTKM6!O@c5NP)n?%u$5?WDh<<>}NK!}R+4`&_iT6>b6>Dy1tF^p~9(^{Js zCK8LV1+$3cs+Rk-BP5=EFh@e-Z>iaQT2+O3OtDJ|4pVoRrnL{Vj0iN?X^GSv&k}(V zRCBFcCHA?xR=RL)=M*R+>`68mUE6R0rhTO}L4OPB>$y5FGO(|EX6nApm!r2nPK7o* zoKem=x^y2tVadrZ!+)gaY{?XEJ_+9|dC$Z+-rgGAyC#hJDS^?&dZi}`iG(}1n(~6P z!SrNnG)~`vh+Vs5FN_d2<4|lqFp3Nm2fnXCn2_9RXuLRS3`eIxKB+AgtXnV)JO7bw zlW@@m!Yb?Jo`c~zhp$Mb&zZf-qPBb(_hSk7OXmoFphunKMy?`=^`S$e2*-a+PHreN zWmr29O&^wPq8qN);)?=?4SEByGkbs$9$$SUnwho9{?IpdUTlYNE{{*j%Xc|s+ar~CqeNx)0eQzS(s4RF&9Z2VpYeOf~Jd z|Ks4ReCgH?B1bUwd`tzkzPWzjp+Ne^VcP$F+pGnfT8-GkLQf#e_mh;#SCKKij zCiaK$9E&xErGF(p^~cl(N)i>;9SlNtae&8m2M;axUJNH*D_r!Qlw+UIt~9vlYL&-Z z8Hf65E&vE{3F2u(CE%%^$G|;kRlVCdSRn;_$UI|J>eZJ@kKw&Nhz{g%xQglg2A3jM zr5@vowR4GcMsjU06GBxGiSZ$S=)Lcn8qeKk)C;w5p{%sS-{f69FHWjy-o={wV`)q| zZtS~{ezcX(xF3W~*^-B~RqC=w8O!=!^C8o4!>==nqk$+9md(*`fe*mRY^2+c82OKj zZ~CWy5-6n4)Ji($$;Obr|IvUA%+@nC8`xL5rv(;~Mz#Vnj569>+58c>Us5>~#pUG> zYcYtva2cVz^J*LKisSZ}QDS`#eHPxCo;=m52slES9m6c-{hdeEnZk-I)?pYH+n$RM@w(tpX zzP}E_7y7-9$rmm*`p1hbWWrWwkIBrk*2Fr2mjv=SHF6H@kt;Hl-%DVX<-#CN8kim|D{QBkR0gVGZ`Y*IH*xOTO?Pd*)cwb(q?Ft`yy^T0KTs#gm)p^Dqf{TrV+59?P zn50Onx+_3-_)D|c{l+rjg~$>rhC=!Bl+sw2(v+*Tt_M}ecHhZgh56q%C%kF8XE9!g zw|FvT5w}0eHy_dhpXYcGdWa{5(K?EzO6F*=>>S!rqffe9tr2~P;n{!A4rRlcCf}9O z;Ugo<^LQ7;P|O2V-Z={g*TTh~Q2BO?T;j}i27<#Eb8h3YEX=nWb+3de)RoHOp0AbJ zwK_KFv3CD{4!7N^3PK!1uY{3Mzd4^z^wQ72qx4u9$)d^f{PusatT6q&!#n>SWRb=VSBIMctMbEneTdPo z%=;77rsGH|k26r5Qou0^NY{s#R~Nt6waUBZlQ|Bo%kDvmMRXIqJgh@`I@e&IBGz(%izL}}=k=S{U2&+@5V>Q#_yapEPi^z`l?j>F z<&0d!w{H!T39;FQ{z!!ijiC(F8*gQTcG)oKWjvRn`^kD}97}u_03C@Tc8?C%`VyD| zg$&4&=;c)byorn@dil=qp3}~Vc;UgZXs|)|`@d(f1D1`5F6WoS248;=iua^HM$d{?8B%W z?&*_fsyq=hd;@(&;k5Hok5&PE&I3f)L`~n!U%V=e_YZg|Bo6Yh{etonoWio1h787I zei#VLaTZKB911Ur>>uxcS=I1rXx!g`>n){CIpH5Woail{?Z(#&eR!SX=Z|_O&pFRN z>)tcc3LlzRURraj>H7ipGa}69f>v?yi7G3W4~iKyY4O}9uq=%*)t}x4c+EA=C*NoC zfkpkd*)`a&GA(Nlxm)lu0=ZC8AFU096(&8`1lW0YhZ3>a(Zb>~8jqC`f}Bey3NLN4 z?)%uA^rF3DS$60szlrHx%`X;?ceRe2dh&NIKbsSh_tc(wj@K~j{&**w7xL}agKIq% zs8_q+R!ok4Z_hi%k#^dh|Vg2rC^Oz|UB_QzxHCjRc=~BcbNJ8Z!K7z$;lhlp>1LDB(j1 
ztWwWMLQ?T8di}+)ndW9a;f_il2mib`pxpil?T`B^D;<1goZW;R)hTVr!DrztOIxLj zPPMB!0Hsh7JytO4|7*7&5&Hn>%*M?^0Xg-R|BhsGQ&XkSb;Wy!WLZ3lLX+VB{&(S_ zaPGN`&pwKZwn+;iNH^Z>;k)nBzoWqv!7+bTn@w%`^Rpa`Rm14lpk+14LwXg~sLa#+ z%&ucR&bbd#kqvt_>i?YIdLrVvZv-U=H{Ol_Em*_Ztlc^UE$?hSQ}BIsg9Ycl7xH=G zWh9r|le;v3x}&K|&1?wiIr}MQ##91zgJb+a254Z3IlxC8wXG$IMHlWyQNU22o6>+F zDt6aP=KCzzxwp9F0$z9`J#ZNzWVChQrp!eyG>!#I)4B%JcUfp`Mv>t*Ji+`VlRLJ_ zKdjg-%pN|^w}>zH>PV5=XVN&FKJLkmTw-^_(e;c(XGPoD$zT$VHu#f3hrKQ%-v->| zZEV-gMjxH{tzj0n0pOncJS=gzmRQx0@vF^8HI>_&5~HjfSlcFTSzk9Dk!Cn{9Q3_I z2p0cF7TYD%{q%T-q7t1tD`mby;c>6N&)K{1^#$!#+&OrJz8bm)j2Wmg7Fi z<@U(dBUgGfm(A8#y96s=2+)RpV2*}$ca*Qx@O@U*egl~TBQ14oJ(^ji5#@NLWTu&SeSy` zb0${I1|EOX2hxABhsUxAZ)1qhcLmT(tXY%{d@R(7RWUxeP{d*cee{@CY<*cbf%k|E zgx%=FznHk3Xfsu9Cio^-VZ;0LOzYGWv{S1Ji=>uPdA?sO42zwMT46))MfPp zsOJKYYHFu(Mt&RzCGkBa?5{%2HxialHzksY7n$vkS#_e&hP7Ss!tCmE>@@1LUJTxG znKGT$R^Aj@Z1BXzAl+E3|*LX#34TfaYlOe3`R)v)6#xIFqWie9QJfIzc+h742 zb=ce4%Nealg2E}P{_wu{6k;*V%{eRjDpX4M55Ftd(Q}=JYSgh_lLnK_w z0fU~c-Wl>u0zUO)`}%W#=cHh_3bHtx>hFnO_!ka89i0(>&XGBN6iV5RjX)i|8RL4M z7gpQCy<}S)osi8DX`eo$dc3ZC&}i`NV^PrrPaS$j|9X#eu96R8apJZvPF?cS$mhH- zf_y5W&Lkv@xAj>+jS==yj;8JSwar0kA{{nO$4_miUG0-iM`iOn;tv&vlI+BbY4E0& za9c@6+?S^gQy1q#qU-BvHS&zqMcTu{q}yDwa92Zzw1bNh7qb{u zF<#94BYzI9*a!DWsTNRLY5LcmoJRNV6PgXECh4CrE81EW7FqYbG}N*UAtqM!d?zk_ zd3wOSV_zNP)~6mxJw;gOmYU*$MEfW+bP8pf3IBlzj{YINhEe#iFSVYD3bN z1tn}neck;v6!@=1k6b<%RN4y0ps~NtVX@AvIF(ag`J9aNzM(_CM*1^Wq^OYJ$9hW! z3wG+OO7-j8KU7+lJq0F$Cb!H37F;#m4kVQuc8>%XNrB_fntN3 zt^90@>Il>S-tkqDQWXl$BkQ`N8B;hu06=LuqhBm>d&@)?U1cBqIV_t;-J;~Ek zg79nHGEB2L03HJR%8K1w=!;%P7E_gV#VJJwH=E6o?lc90+hJU4g5q*ZlT5haKqKR4 zL3XU=zfU)=gK2U3VnT~jHj;`rNvRoagC4#@eEcRadE<`p+^+*$o5TOWwh7uqZdwnR zM~;FzafPkmX&b)QUd0iGq)w6S z_w(>yoLM>P(3q%;VU(nv@t zoq}|OG>CvAF?5%ph;&MqfP~LJ-0$v-kR}^eL*SW zkKV6E9vFvOHu0kAzsa93CExxl&&Mt%=LSJH#6G`P>I0?_cC3AS8Js4W3G@I%WwPBK zaCUp24ufsl$r!z?Qxs^%hHCn5u)vM5OvJ*>&T$|jEA@9-{`6yv_V(9Hwh~LKTYY-g z3L)M|A(S6okoy){H-aD={#W@O37`u~Z5Fpb(?Ziu^J!#}eV{u)vTg+ND=F1#EJR9e z%8q^XDrdj8LlRQ|ruOVVwxfG((4&kAwn-9)__p6v^bB`S7)wD~*QXW;&|MF>f9gYU z)7K!Q_zJxUlb{Y<^kLIHltm%IT<6SF4Y@?XZkf4^4*D$Rul?|z&#SDzv4cV7tL^uhWGQYYZDDNqKs6Yl6>$vB^p zZ3(ccZMX|Xdxt)V`us;JB^y2ep>r3Sx&QxuH3*S@j|>>zGf+tytIg>`=4pKw8R@!t zw@Zk4m()B9Jp(VjjQ>9agR~RcLmAFfK|*e4xujrE(7P4~h`N&LLR{b%2#59Weg32l zu(|w5Ff|A-c6|S=qP>%Q=nFhfwj73je{|VYWRE(sc>nX6%A^*K;LaE#+GaEAi@D8n zHK4Jt!9uRipVJG#W^p0%Qx)G)0hhBb??TB!fVld+_3enK>}`4{e=5si8Ji77qo_SZ zjuC#HfLX+L^<(_5(9tDTXo1UejoyE^J0E$^?6SP-Q2x_RFU~tS7?~ZD3^6f-HYd4k zK+^(L8@clG@_hE#qOcRtkWy76993OWq3Yo1sBb`(hP5{u@Z-z19_rDnXB-*Te*gHl z9B+?bgxs>lf32C43>Zrs>rOV)So(RsG>vD9xUYhRJMHMmO;4B2>(5U#WrCsrz~1Qx zr)V}{8gkeiIttA@0#n;Q02<2MGBQQ@Lcx5fVSLtvGU*0FWa8oBoiz8v(n19+S@UeU zL^7tMWLxt>;N5Mxs=eIX3WERVGidiT5Fcd~Ag`T_$Svk&unv>dkL-0$LLK-AP_?Nb zqI7t9eYEiJMjvnpJ_EJ(f{kB zsu!E(gszbhJFr`lZ2qkvMJU3Uv-W{Yd=v z64l7@b8r24%HG!V?~1G&QTQTIAHxKgruPpHCatbo-A;=~;Loxsvsn5}U9wjeSF~1l}bn+Ka9rzt~ zGcOmwiLm_Y>&;)sn25`K2G?QGt$yg+2Gxd0{=0HlTLJ3Mhc@uHJ%FtuX${0maJ#2P z=e8k8MNQ#8*gGdedS?os`DVQ7{*Rw%;dr9~JGH$7fWyTFX+T#{Q#(A(}4eR!@Uu zc!EzO$}Zr+QPXMnmU_HR`V!vtKTW3q%tzm5CUfCXOw^*(SJ&E+y0>@<0=kj)YF`4V z{Otr)6lv`IvvZ&Y-98%4VZZ69`tzQs6QM8&Lg;nm89-xZy&<-*fWYi|>HZv;3U7JA zFMoOJOVCAu7VA^alf7S0AtiS=0tIRyf}(9Hf|sCTG`%=#wn6_bARr5HpGrz&~4RcyKfgnik5M^WO<;7(c)2 zDcYsgP>Pzr3UxcK>JSE1^fQ;Qz=)F$Whr<#oIw)C2#RkwUnwB_m=mG&`e}GWNDk371r>PRYaWBjJfvN9S(Rwk?Y7MAk zDYb3iei|$j!g(mo@FIf)@e=3lM(9WgjRFP8NmWD6z4tgMM0gDJqYR7l{Npa8BXe;< zu_5sDZg+cLum$q~v+#>n9Vaf39i!+ZD^^`j$|e`f!3{G3T+PIcX%T#}WbQZQ-I4jd z)@s9Pv4L4o_UjgMaSLM}a zK%*Qh_kadC;SnGWBDtOU^Nd(|qsZBip8lS!T-&7<;P!4Y;!t?t3{s75008aKhDJu< 
z?Celw`cq;Ub85u(OaXnZRs7?VXpru*;r|q)0a}M9LlR9WN-GzDh{Fp!xlVuDYuXvw zp9a+xNnvE2%q}|}z(~o&>BwPs|1yD(IGqErw>csPq{G(!f?jintG?*QSDj$tiJjdK zy218lL8xo~U8HlX7kH>@8zi7?^x=B+y%ERczdFdofD`(NnTt;; z(U2A)&&Kh5bLuU1p1<9G5Bxh+?o6PjM(HKctCiUF??NRf1OzG#p%$o0&y@jrt+3Gi z``pF{4SBCXSag{7Y!+=RkJ1t#uS8ipAVO{cE~NWIp~*-3T(na?hrWx~x&KgUPf)Ct z&9F1L5Fy?p(+o~cyF9V*KN0yo)*3Hf#g5+%Ek=o5vS(<8bjM-}7>dJbbYX6PsK{Nr z#HEW(ouIV5BF06T4H+29UmtIIRJW3Ttk6P%m5PG#?wIxSh8$@?L)V^6x+lS>MG)Z) zdU=tNkCct_TW0D%_o&l{0FwaFMAB_73gp!<8%r8AeWA+tXY)1VYsipNZtkf&2uTAA(FE|q4k-Q-lMv0l)C1?s1A zNM2MgN&S^-#Iwa>x~w2mVBmUK5GgTp4J=B<;qfy>`2va@%xIBG33vHR;-APrF^9$5 z?8zuR-uL~>MJ&Y295{Cc7lumR!wW3$S$K1cV|2wF-XXlk#ZlOJG>QV)!V3|=GhF|p zevEP{y&i0cRpO>fkBa_DR|juZ`jR|L!LVHt_^(|XC-#$oC>G}))Y#;wsK9Pqb3{22 z`m>s=O3w1aqNgrj%WaEg-d*nXHwZ8%C`iZ!jM&ErJui5^t>(OhSFnXDdyCeCNP6a0( zpmjTPIqKdSS~dG^oh!wz@Uw=8R#vlTaP4orr08K76e6<8=LRt;acy}@`l71CE@4hj zeu~9-+JMeDPwG+9eidoS!b!>t;Revql{kLaj|n)FdNPx%ORR9~${fd)(8ujm*Ltn`;vO}s zrg=Z^eWCdEgY)mbmXE$4G%0cfU?2t*)9!}1&ATV62aLH;IH?ADhKI+%*1|-uO05Sgm9ct*C0CG>i%-GWEtnF z%1cx2`b?w+fJOwQO2mDN4O?g2ya&Jn#~lt4Q!J%Qy~=(}A84n)@hi`mst3z&Qjb%L zWAwbiyx)g-M0TH+(jG0c5GRe|FKh0V*4LqaX&s!N=>Z!baSZ=an3xsjhCBde4B@@g%kl)CoLh5jF*Zc-EaHp`JGz*uEqAcQ zqh*C{m#5b}06RfoS=#cY&+@r9GSGRuexUtp=C#1|F%ZK9)gEIWN-=rOqVuGsAqBbl&&v+z}%s)l0i5 zvbs6qa+ZsUIXs_-`~H0XREzpzV9;_%vLEm5(5yFrq;DpV!(oG#@G&i7^fs%&lkS)! z-(uysz{BRI1Ol<0d*5qh3nc7HUnENh-H=4{nSE)0I#DBA{J}cP_{hA_-vK^|o{ds$ z5{q#W(@gkICD@QM9lzi{+r@n~)h?l+2u=xP)?1`a@HC{>$OVsMPO|8Jo|W6-be2PH z`eMxx#vLB&)L0qKMYZ~Q?UdNNLGZ+FkBma7Ja(`-x0)wore1KytMM3B=9(#8ER75|9f{H*Ju$0cIilgJO5l%O>!BsG}nCVXX-pkP< z9xg3R`9tY1f!rk;U`ZzZNOG7`A-+A)jl0~(3X|t7Q@1hA6e38ozxo(Ql|`SC2b~+h z=;>*LQcwz+eljCUG_N_PP6h$hmIXZJoc|pp*sv9b0p&3&FC9-uoAe$RpBAebPxJ`Z zFc$Umn&7I0F}I>la#T7T8-{N$7(6QO^WQz8PIZQkG@P%alp`?d%4c=Rt|XNV3dDC z^^StCp6roYm>Fm~wtdDI?TsF125gE0cmZ0M#2&j@>=M`@Z4FF@9S*izAfQMiyLb8@ zUAd*Ou-nYS7Cvh=_D~O8U{V*G8kXvOw-^s5zPaD*FFtLv2P7Lzn zv<~ml##64vU$_?dLgFX0oSSmUs)yRNlIk-|8+OijrmSCm>v5vbLjk6``BWRh_g6J6 zy%}gJ8VOzZz-#5r zr{`4(G5||p!civ>1{IZ5lA}4nNh8v1f;aKruK@J%uPEYn^*5E{7kJAAw>q0>jDxnz zgK2}jTllaMha~jZWx=y7)1(^T|FI*lc@~j4?qSo{gj;1PtIWlqWh8)(V~n1OLr5uc zXHaOFINA>+GS*pIuE#nkK%ow}v#?Y!(uP3#vPjCq>KiK`tJ0D0n508apr5vh30GKH z*u)sISdwvNqA+bvQVyrjFP&e`R$*pZJ2M{%JsLZOk+*ss<|8$0X{u+X3X*Rd$%d_; zF*)+uX+TgH;HALs+IX9F!}ftmRbH;%@bC+C^L@i;xI?_=p$g@|R7-C@Tr&qUrSpb82d9o9u$`@9_Kk>2q=B>A|RkO zTxgasT=K2|W9IKx@fQDm;Hc1&k}#h2c`Ck&G(`t&+HT2fYsft1+&H5Fnce_Rt#3+OAxD>bl>^QJqBJvV2VRk$m8?5a4L~+MxSZ*;m6Fx*V;r zu0`W$t@X7Or+CScI5nmF1GtC^0cIM4-+(>pdqGOg!M39Tj&BS>07iWM7PZ|Ilq>)J z#09Y}A<0JoEioxcAa?K9=k1=_j!+`ZN&xg)PmDjX>f3a z1=!N)j-oAEx0fygST-#INb($)4&(JKkHNlw)Gt5T%%ReAZoYUh+mK2Yfh7}CpP133 zk;zyf55uG76-AqiJTZ_2Hyd-K2yev}lyHL5LBUoP{$R~>5rCS}xa6Lp&J}TgP)eNS zOP~HJh;l`7f4vO`(g)H?MK8->{HlU-D-LzmZuKfV&B z3C|&tQm~cQ_)$N~Gm^J?v2C(ViEHXT;unt5CNkHOQ~X z$!}qB4$zyhWxRJNTsVM06!N=mAx{Jpl&jaEzSdv9h2!y{6=qMb7g-)5$Iiczm&luF+A z^%tdj0D(9kKYDPDZ22@*p7sx z=|eh2IGxEvi4|@~x{8*`9f%CrGuvdDEIu4|mDCrm%rMksNyih&(%}X;E`wmPuI51T z{mA}@QiVu$X7H5rM%JKe37&KY0_B`{!nX~+jZ#~VO7;q48>_%O?_jq@B{en2TDG@d z41Y8g{bKn%yJPuSB=jL!Tbf`jiIl4^NYRgwq&gxe@o6>!Y49~7yq+7bbu2XYg*H4+ z!tOmErck2G#MYo&vIofP0P!#9$OS!qAL3LGVtvP>Meln)#6;esdlDOU(oGp}f^<_7 zv%zE8TMet<6+DMLQVMc`In#3xANnxx{svD7Rv@)6F|DLYoa4MZ^M}yvFMrr!-wCly zNN;DIB`(s7X&AcKzn76uS|#U}o=Bv%w<_Dy|LOIk3-DnW*SpRJ(`l3K1s#?ps4Ls{ znZfHA^gx6}90|55lO_zltR8@!sHc z6iVdiJr(FF3Au#o0Eo$cHr?nPdVm5iXCLGi>BTvNQoexubV@B}@e631qK5D@+uzT~ zm^f$?V*$!*YoV=ZEUAEp-2YvdjM!bUHj)lM#i`&jC-0AIj79Z34M1 z2~sT}+}qKjPr?tdhB8bRt4-F>Fb8IbULIFmwSvRL8qkQp^%ocvHv?W(nzlS0Ko`xB 
zlOFCMC})AsGLM6U*Q*oSJ`hQq#1}*E#Obp~R3IR&1ksQAo!ZHUpw6j*bljkL?EQEC z9T^0Dk-TsH5@SK50VTg({K!C`pA22a^mo9@xwz$J+s@-(#QIP|(}Jvv5gEU|?4Gz#ueU;9sMmTc$AnDN+}zxH1FHS@ctONK!iLiWqF9pH z?f5iSqN*iYx9plO>TJT!XYX8Sx0bS@SK$h$-B4WMrNkgKb1L|QudHNA(KptCSXZk| zN<_;c)QJAH)GX-f${HNkRm{C-tZ4_jvx-Ac;5<0*iYf;rbaEKC60@7n`LrTGdH#NO z^x&h4!<(rZExhN_Y9#k<;2ari=`HWR@?lOI&J9x9J6Z{cTDXRp=~3-iB3B=0$5!Z zqxlC~s_|F6O0JO!pZ;kJ7v{<>>9fN?`T^8;BgTNOf0pa66%v}q> zfu6T5r8{QR<-;{T6F+O0=xv$|f#pl2l#~g)B5bTmpi--XpBWn&G~&wj)0v=vPnr)^ z4b2~Y94Zo0OOg!jM}ltxhat*AJew&MYkLTKl=)Hy9%|;dO`BhTQlT>` zMafvtZ;rTTUpzhka2fL98#%RF#rXbBQw*WO)tYF`TgM+>A9(~@W3OvKm}UHWEFi=p zWTpYOvip#tXV=eyqSm9GV z#`YIoe|h<&wTT*!t!e!cxMH$*OpN9{BUqKsiT&bj+6uy6pN!o-E6V?akD%ny*769y zQdZS8Tu$@RarA6J#dPuvyOd}(y&`}qJfHGh#wIfOMc$69E&55RT-n&-_o}AWJ#AA( z)N}?D_e;h8Ie9JwPljCx=mxT5l$;5B$Z^tqP$9POXev;ZKedndV^Lz;5uK{IDr3o2 zt;NiADmUy;NcyR^xR5t3A3H_U0#z|fuIu(Qots4o4&@MwNPT3}HE0Ho2}PWsGQ#Yw zcuJaeeiznN$e`_sW;b_{UlrCjw1=gW!*q1ER+l;%@aH8g$_GPQM>f=|lO`@YY&l$C z;oD;7IPp-nG<<8a1can+V!=+bVb3EMUPjm!!>_k}H4eBSuT^D8=ai85^C5l`BhpSU zocp;PrKpcwBI9U{DzC?pW2(i)GB0QLh7~+fK~FIp)r-gm$LWFW-@{T^7Uxkt*5K5G z$<aSPwf#-i%A7uYxYm2_rSs%(~-I|zLGYl8z&L-MW#a;hr2c=EvBd)P; z)iyLFL3%A`Du@R5You0i_a`Rb@rd9>#UI9=?H;(c%yFHUrhoG~{wF_U)OyfjV8Mqs zTOuK^JBf)f&O&Eu_e~TG0?`G8_ZDd8cOx?IB*H0rRGuElu{o^(hIGb{J&Z+4_ zrTP?(`#DYp+g8AT^T+vibA*yu)<`FEv%YpeR;Wa0pL8zn9`dnu?q^>jwfC1B{RJ12 zc>%4lh`a;S*%Q&u&jb`n7T*Ihtos7r!CJQ|QY;Q=E`8}gjn*piz835sGpXR6x;B3| z#&zlqfas}3>mDum0Rmen<7We`}yY*BpEbbVq4Mu17<1q#jX zBEAm^NR_D%6JLI@8j>%ye~6N4qG&he4EQay2s2XzdgUp2%8Dp|3ty!F?v4&SCW1rLVq82IX8wMiXGCAG^qHnUvpv( zuX1FJJ3tKE<6k@D($Qzo*SRHS5M+BZzEl=L8gwk8_56B^&!3u`TPQCd+*d5{bv-~o zNSo@nXNTqRrIC?lJ(80~=zZD?CX zQi_R&KB zC3bnOwpLcjYjkS>I~2+{V8=SUD{jfTj8wTy3eiSRKV591Q9J<*us>nqP9Z&zYD~;ifQQ)cRZjDC!yC( zyZE@EnRHt;hdoF)X%(OJ0-QTQI{W|9H$BS@jk@I)aGvn9*4`zr|Ki$T8?|gS%7G-N zXZhWNfjB;cDsBGA4PZA20Th^DP*B;#BnzN-!(uUjZ6+ho*W2$he4K>yo#Fo1|4t$Z z?a4bu3jaL%jWAhQ9wh)=(jf2-b3l!Ib3nlWSYZ6ad)X!|x6aU=b+&TA`gj^t25MvieBLl!P1Q#XM zZF&wTpxg0#T6j#n^~q}p|L$%^>IU+a-`Vjy0moD*I{>*1Aj;C*oUso8c>p1a$s^37 zfgVDuE&cD>Z5%#?yroX@IO>im)q|odM?eQt219SF-&19YnKUO3$tfvic?Uc7qXUrg zb$WX0>p$CIe+0lEn+y7;K99kck;?~iAN<+*F|2$AcHPP0;pXnR0+-On*l{l3*9z*OY%#=w`sG`s&coJ|L^3k6(jx%sX@TmpTR{EQ)+wDg#GSveU3=ZFc4A(qB4}E*9My!%lPSj35zE{O$kvoD=2gXh5=F{7U>PS- zjD;2637j&l&}mXoEfct}9AGY^z+zyse+)#uLFjDZ8UPICZh*hk2V#2#7LXPwK{$5? 
zyd=png0qUP=G-1mFfofM0s9)Lm;x&3Jplo&mNS(kRefO( z8^}@uWTIq1elU@bqvHZui%JOHOF(D21mZE%mi)}-oZtl2{KM%LuzzefL;GnUfzVc_ ztT48lw_9uM(h}VQj=>Z>IZrlX0K%L?6=~s+TgM6v*?<3GrJ)N0{N!d3;_*;XRMZTG z(6r;lu+Dh-#lMinY%&rx{vDh%76Ch;P)+Bxt{sU8udVCfPA^|*g?ya_=`FKCLw`RV z$MH&UO~?f?sVOUKTcT)|LRp>ZCasAA`Iw30ro%jt*7;pN+SjLv8LcJVE0=r#4ucQC zXW%Pz0-ME1M3a_ZBs1B4QZ}RM= zWMx&(SP(-z?tg!PwI>H10fQRr&JHt}0mPK;ofd07%XDO$#rO@+;Vsw`lZ;H1y|CoT z1I;=%2NWuRn#^#!IhI3HY8Gl(9yVb^rL1oy}QyX?UMj|SHMKSmU zW4z!Elw&dZ>mh!E^Y9!&*ZBPze!9?$Y+v-IJs@L^zx>B28?N~8=K7mvuI2JO9!sn9 zgC(P+-u6gA0Jp^!R-ie-HecBO-q(73eC$cDA>E|a@exjv{RxSf#s5$CFn^Xffk~jP z|M9n4VyH|FKqqb8d+b5?1=Q&@5acvo0YEjE$G3dV$pvtT)zbpjts5hi1%Tcz0$pl# z0Q2PNj*5j6j-G`FpkOjk&+&kBx(u7d9+%p%`SlJ<9B zAF}^r4642NK=8(HE_^&2G`AI1)W}ew-=-QPgW8nJ2dm`3bX=H+;T@U%Vteqg8Wul2$UYvr0e+VQ6|RO^7yzWH6A*nN=6^f}JN&ns zg=zj&VIrtET4@I%I9061xMrH3UhVx##|He@vi;Gk+fm16y?AISUT}z~>-mq=%2#a( z(^_OdwE7rEXTBKCemarAd}%`|i?YdOu8y`vJdc=7oE?xaW5*Xo7g`<*Q*?b2u-S{N zV?9)wN5Wblk?RSTS)nan9;rYog28%&JM#LKX4Os=2EsQDWh9Tcs?@(I;c|I#lewG_ z%ZP7I*mg1uV(_2@)omf`j)HI_EPG?Wxowh51zv46@e;YWxhn8IMh;ewk==0dVC zGNc_Y%gyjRP3Eoik#{E67w|ZKj3BQ-o@$i|3ki(y@jF$fKn9a7Wc@rDf+la+%f6f% zd3yIDr}H?Lv||rSLuv%nP`|;ASU^RF499>fC`8t(9o&gP7Sn;uqk|3&+{gu(ztPlq z9o?kaRm6Ii_qnc%(#z=Lxl=0{zlm9U=&dW8$UZCZz3?gt?3#K-~tI2al8muPt6 z7hViG{xN>jyHD){A>K%v@NtThoty{PPwHZ#V93Z2p00{He%c(cZaBuLCzbEFI+;8N zOpoB-2TStD8!vZ|&`KUSlnob|D;)_3KIzE#Y%sR+vsP?$QYb4&0< zj88}}Ux{k|HN;>*_5Cu)`pt9qp|YsVsCJ&d=a0H5v7SdfFQKHOjoEAEJ89j2mXTHP~@2z>InOyqE)i^ zOf)6m34DEd`I+9IzH{TmDbg~uw*P+PBte6Y+4_VFqo@Y#3JM`+;dtn zHBXY2UOrs)M{lIDiICx*JB(O9^CLFDcm8%Bo!b{Ctd&rZe*FXg-1fmTVDpu}!pFmV z{vf}Wn4^;+mn29AVeZ{n2yrVL(18#mp7}LVQ^(4ym7BLIRgtIzG@<4{C940x@RZ&? z@9JTBFnGcs_oDf^hsl%I zGVO###N2IvYX{BUp*w`?_xnPtm#tH};b-54Iw3P8ct%y)-gn7o62o)bBnfAKLsgJl zWP)qiL4Sj*a1I{8yA}okr{7MEv0x~T{E`s28rVDCfLC<;0`}Z1`Vi2z--BYTfc;$; zu>Z}-dsJHKkR0B^-4tPu9h8mbOZU#K1}Js?JiRo`X{Nqq*)MJ%763nA2+YF~bTCVw z4o$}W0DaCUkX(RoWm7Vco8s=Fod=+uKcydb)>t5?5W;K8I;+6`ngo1@qtR%V%@ejR zsW7T^OXxPP4bn{79Phn3;VZCgr=cd4Z2@%$YhVl(Pb76U7ETnsCSN022wsAp7n-C~ zm=_<;*?am5wu~Q|uXRw`W->x?J9TOP=rx(GOkWd|DROGG|1QS5WT2hoM~h?nI1jMw zMOUf66ZI^wdV+|yE;`FqHF2|(1rv7uq{NJT>ZC%?4lY$O{0e9D61rVmz$&>Xe^Q+Uqq_X`2uSJ^QdUbk!HG#3Wl=N)KIt#`k!;3LK}uc^`ux*g|fjD6_9B1{q5+U7EBka$xHLV z+~p>>&4@_F{)IPbBFj*qLP!pTd^nfpb_8sFHeNqxQhYAO{41@A~ohm z5<2hc830!|e1Hxa4blz{Zwh}DcD^>JMf&Y=Rhr&RQAeT;P= z0of(t)qzOGBIIALQ@RZWsE0M~AS5{tS5Weml34m*-5k?HIPz8CfKLSmTA#4zn~!|q zXuvFA4i1$Gyn$EV02w)G4P5jeLt~@Yl&fV7dD~mh8gGLGRKGhxP%7nTkDTyCfMF`LfaK? 
zk%ngUo}<@{olIx*KbOF^jUOo~box$L%|@2GT>9)7~IY0p?3Y4N~2Em>xDc|U1PU+76Aakf!Eh~$fGenUT9x-R+@=e>SmL{AG=~8}C zR`2rsaqyJlc*N`D2rGcjgZgw2@sO%#=I;5b`ai7JJz_>0QekKsuCBE&XZV6$z@cEa z?yM%}0I2KSYin!2)zjs-E{8*#&egvdvw7W2?2g8UqGQ40!A9WsVNbHFAoZIwZg{xu zV>rggloOY*d#qBGgS(Hxf{NV%3yH<K|T)p|Sj_oq7Q^gMe+g zPT=_xHrM;;6?$OpcAK>?vA-x1@BPdmAn@_xNIuwtlK?EA=06x^vCtJnL;34L^_pNr zJHjYP4=eNSd~4cA?E;yo$*fnPvZEhN>ThcTieedqEZW29zt*D#rAo?kQscN@OiP^& z!4aZV0a&*EV9_!DvGZ&Nlkk6^R5pPa3a73P1Ds@02}~NPFF@%Z1b|Ch%PaYkh%=Lg ziz-_pNb}<&DYc)X9b6xMicojvg#7|3@Ut|?Ufz3i;SA487tulE-HkK)-zPRzrurs5 z(a{!gx6*t38?iiv4c7eI|cmAh+1ay-w$?F7GHi8Fi0X~nP zcBSg|x7!s0&j`m*#6B_GqB_dLgPgZ+!efDiP-xr1K+P7;6`!8MejQ<>5}3yo!xvrk zs2VjL{5%=}49l=^PyCVOSX4Ll1iZd}p7)cip=|9BVks|g(OwjbE~M;XeOP{=DHk?@N1|Y7gM>&j!ye?}rO7&TI1jqw^!SKQs@lpi_R?-uHm&Ae_ z&$|)~S1FNx4$4hN2Z1Nkoq=>Ajy{{Y`e8yfZ}QjhI_zJ#)TcW=uVVj> zkPAf@oFbOmYFy4E{kqKN5Py9Pk#drsRjxk|Mwq1{mPRXIALbN%b?+uvv6E0ee|Sbg z*w$`x%|gb)lA}V|Q&M(zuqw5kM$vgs(MhjBHx1L|ouZXVWWyk>yUIY3S#?Nc(m0(9 z-W;%lumiW0%-pV-NZbvO%+nxTA}`PdAsW6uYM?=@pDP6(e z<5j5o7kVQvru>G|**I{OJ@sN>RQ$RQ9NPB`G=8jN2q&~qu}`x+$T4GW#Sd3(;VALs zR`pRT;1^jbtL+GT=$srbT@YUOj(!R~Kesp^7??~l5vQ*0^D|V>Ue&Xa0kxFyzwtvk)XAzFg!k)_Dl(INI)xtUq5Mq;nG#xh>+HvGa;-ii zt^%!(jo&?wPK(E%%pDz~JH7w9j3}ba<3f(^b(wpQnNO|CWSC4Darlq*1`1r#d#dW? z>!jTYyOP)rOcryt(TYd1=}Sy7w|`DG^C#ElEUTj?cOhM;%t@-@q>9S_cGFQGLFajcz^C;X0z~ zOEbmI)7lw2Fne{2LM22z>vq-0<_}YogbUmsw1)zC#`iTIYZPg0$GZNS9mr+ZNQPlyoqaQK4MZ@VN24M8 z7-1{z0(#uhI*0~IErX60m%-ZU=9$kP0?|6=Y?P9qP9Ylk>2C%%MX;y(BsfnJVdLgfw&2D~ZejmP= zLRk3Zmg0`qJHqci1o(Bie&V*bePi8GZ4}m`dJB4gr` zk=%Z<>VH7H_MN6DK$>yO_Udux!ABm^=*#GA12)w{YsGhAlO7gxcNel$&x>&DH6>{W zSyh*h3^EY3THypE?e7I@pM0XEM=*&w!(quHN$81_Txw}#ZfydJKcxnjuUM$nt$={Z ztof5|@?UAf(LK?*QU&@J8D=w4VckBD&pHouYA2r+Br4$Bd(}NTr|G5HGg?1>WN}V| zTX4ftV$WDD}aEnsw0jT0cRDn#hnR3i~nu2gmI5gmR2rWi$e;wswbn zW+6U=C;ptVbOC%tXYHy2E-h5I`$dFIRH*v9_GC_kB+Ztm$ z(BG!3vX!|CPFJ}Pdt2ncR!Em#0TYkJ#nGTfQ7upHo2J&X($aWzLSbn)H@8CD(d;82 z^eNlhmyP_q0AZ&)zJyQE!L!h$4wS)cdjeF_AV>*4Jl+_MJ2&8^qM}-D`*&Sk@%8WT zx~<~(AUmV83cj8Od^&0yo4GavN`0?xM=>g7&zLKk} zYuEiDa(!cHBQ5Pt6V>Tn39>K%CN>l~m_POHTR%{X9yT{Ov(Eq%+tZK^^Ht}cK^AQj z4$Tu@nol0dVht%RYVzGaA#LChL%XR}wp91R)U~{d@55yjKdqeCQGaum6_S<$yk*b?mh_OlLl_d&aBG6xq8)X{L#Q?CZ_A0`<5b zf$i_Z+I$wt&IXpdEQ)53fdRY^$`0%*ct5{#GT@m!Cc&vzH=t3q86MV{5y3I}Z?FGO zcFfBqb(Q?yC=FVRI%yf1gzh;grK{R$-k7cu5FYxd8kY*r3FVi-BftrAS+$uios7J` z9#X*Dx4Q;JN{=`B^ABpbvv@E2KQiqngP5dzw%=ZqVl*u^%R>r)od?ojNh^lwmT!?l zTnFUwFbljW0(L9rsyp5Lpa*Le2*GK<=3op(cP=0e6+8mn>fTo@fk0TU^1COJ%6BZ6 zfT6(a=<>;=blzQZ@_v9fF6O%AYW)7F#cRI`YapIcb`@9sE;7*l2(^&#tnt%sp zXiW~e+wRTZusg|=^4#m@+b(44;8Vi8wEpcJ#*g#A4^mB_e~+1fwpOP22iSKV8QW*9 zeZV5Y1{hQod{~0XAl0H!tdX<)m3|}nNo}Lj$DE3GLPBU&G>r(bq97p4> zPhj9z2=Pv(@8i?lc+wk7yB%HF+OUCwc@4gy4>(*{AXM`FPmtTQhPyAg;z6yn z;@~PMTsUl94Zkm1%Fwz7Bh@_- zvI93@q(pomO)N7~EPNCVgh1}(`PR#U!`_@dvTxI%Q*5J&VvnpuIPj&HY`Hl&Zoq*_ zhjy*;=bo$bi?871gRCMKn|gqB=JnXbx%jytm;w%)5zISXxx`!LQu|+ySAjp~<*46# z_G<$>rJ3pJ0Z{3;27rxPnEi0yg|7kMBA1VW%3iYar(r~E1Q=av0LPDGwrDgn2+T!( z_3yrL19J-veC$Rd%7pu_rp3NWV~MRoBN4*Tm(b+1eiwaE9ytWCn}$M0so5IfW!R~-3b%?KVGM@gF<3r z3KNr)dRvw_z_~;;o4-oOC7Z07&n5TEyXX3?QT11VAhtXfXRT(a6|ZVF1Wi4)%wNpI7b6CM8B!ze3jD2pB>Db(_x<5A>M!apYqZbdprAcL2W&!B!izpv;;4Zf$-?2-7hUu|i*)kntr)M5 zea_^mm)P!}1f6fR+X-)Ex_ht{wlco|~*;wml z5{6y_E)|CEIiNSz0LL`z(MsIHKs4p#YKnfgPIn^^CO*(dhbkrAcqm)9fEI~=Q*7Vu zOC%a*7&thd241XTw@TTLaFz%gz3bll{ma~>0Apagu*NIn$vmxVxR5-F@_qd)6(N(b19uA>~jOOGk z!Y6K85+_&Md?$EYy4Iqo=cv5~`$P5-4Nn*Lu)G_7B<4q@ntr1H{~PMn9c-vCIH zGx_ZYZ8zE*La{Q7M-z+D!QsDsx5V!09Qk_8t(meJ zu^jU>-hfBN2ecxA)`qc(p7!q`vihyhd|hfI1@S7VfdYFef`GdCP0kWlGztr>PA;)& 
zb0Dspnz{V|Qyz!QS8|fN-K7VJF;x=Ooa9zTU37jsc@pSX0bs%ON4W2LUS%@I(Mp@pDgwdbD0{&RUn$yGk&iAuw8ih>V6dmV|oUoh-ZueJ=}KNwK=L`arO?`{SO#+-38wEGJJbC0DM=r9P5NyP-w5D z;8p2|V0ahWZj3=x_vivVS5rl+AeXA6QNxu|T0fUxs5<*{G~=xE2o5P4)<>iJ?LuhEeqekfn?oy%(Ntn<&}rG$0AlOkTX4wdf7W z!rM?MP>eIBY!MWpEUk7-^x>nv$I-TS-#|BfoU7lUN#pDvU_mB>)m!!JjYIy)rWSGY zd*R?WSo+gh6V%B+7#pLkG22NLX!0&G5VCZSHwfs7M%Alv2Iws9d>+k6DQ7z>F>;<_ zwLLvQv!C)dx}=pjyt-_?yZr5O6C_yRA`}B-yOW6SA${J9_$o|kKR{r|l|H#u&lgjvm zI6{iS)7tZ`ad2Ib>P0}EZkn!&n*A`7zO`1^(79vF)JS#8NwLS)%%?VfU4gEc58pm~ zQ9t_pmXOpVSz<-YoK2+VE0~>{nef8#0}*#pKk=Y+QET+Gpjou?Zalm=Ubw3u{?9+o z7AEa@(Ip}J5Kh5qU1*ES%-%Rt$YtJ_8yORIhlZPDCv9oKdS>ojpD$3#re()?4#^cm zJRKuS?BugX^7o{~73x-oR!cm4M|{CBLOHH>7i{UYgw~ZB-TL5H&@8*sFgwfhA7?9H zD%}7nHD_(~o$vpG7*p1);rAwFgK`XCeN$Hd#4pk6;kxfu*LtiHatdkTHUBP>&C0EB z=D^lJ5?2Q~-S^fJ=HY1|(Qz&&cEY1H5*=(EL5IG6mqCa7kEz|ZDVoHbokvybwOsPm zPSi)<6lBcrL|ad-e1l4GwvO*P1Wjk2SFX7&TBBjMN<|hPm~h@wcKqL&xnNrXta(2V zd%-+g)yAg$gzjAtxm=y*T6A#Y56_7)%rkqT78aVuevL$kUkW6tO)CCTvnXAv!E4d?iO zZ4^yI+=JOwLVbCk=~&M2W!#Pd>^#UHsXf-DLhIMzsd>g!E!Io&UdG&m0RDGyH?`PW zDq%?~m0JJv*;kI(a>&8Ll`D(IoV4x2Y$VCYI;|`lF!R*X$wSRMAO#P z^7=t_>ggprtdHcMlxKSk(B1prf#v(ztAXl%&nA+RR)VJzLS-Yii0J9|Tr>NR##>44 zN)d=#0kX11S|5of_IP+olx)@ zU!5+H4q-@SmzdyT|F8yKXfBWpC7}9G6U-;LwX5-UIXJ%n^v!Sd&Hv--t)rrPzc)}C z#G#~ykdW??k`9ppX_SzZkq&7@x`#$OlrE(tMMRJiq+>t{2|*g95fFs?9>1U8U3c9- zT#IpF=Hz?!e)bdDZy`gnF&r43#(699y^JP}FUjD<&O`74&BPj{jKaJs1A(`c2pF9| z7ub+IYf1vvVWbkMU+%)cHoxHU8Eb(58nbp)HXCRGR~?n#{E4WySBGQ-)Sat`d-G~^ zcEwMXje={YnaH8S8zw%+6xjKib#7%yt!PrZz4_O#y=9N>psoT-;QDVx?IXtHVFgP|RTDYr(CW7n+4nG&KjFJ*J!cTQ>z|cpGxHjet1B9t|fM14(6B0F0 zr|1t%$}ywuNU5ji(vW`89r(Mb8yKVmAN792#{nEbc5M1;&C=)Yx9O*%a4VOPC`-N? zL&DZD4c;Axe{~%M;Y0OE=94-dbG_=6(&P<0L#lOI08TomM*en`aai&-)+ByR+Q zM4Q3Pf)UkuH@1b(%yXgDKmvK{Ly$|`CPR4HL>Mk0DEQ5PjN!I*Kh+TJB$%giLvjf! 
z8k%z58W3LyF|0VjcxtjHhr;Zcgcysp^5TM@+i2{!h1L_WA_0bglZL3bLyq2%c42(9 zueEAhfzb?(8jX$tKmgLSeO9yFd+2O?0n@6o>KMpV6rltm(5N3SgVT}$#reHWgQxEQ z{4xN-@d{%ob<)A3v=M5sA3S()3p`?Lvwb(q=yyPzdFtz2U$rg)QWqGMO+(*USN8z8 zBuPP&Mg3{_D^TtvfmYOvB>r9&Xt+#WJMiffVj)l0U0_19mR&UWXMV~Dq#`eajQg)^ zKMsM#b?FG&bZ?-3DMQBW`C`Q8Gf<7asSQqQckKW*8BbPL){N39&|~xs52pifqPstL z<6NCp7CMYDMcz*sDAR_^QhdgpECC0w(4tE2dnV_@?slF?I-|0;Pc)dqusGCX8I^s; zlxh3YO`JE3z3m}rt|BS;;JaN>Jd}kzX0Q*?8zC^!OMyJ>(>6iHm`}{Cd zBG`KL0MM5O+MDicYfm--mvWM)$K!l^;GhpUR8*C2$U-(KP@W_zoY7`Dn@3w_Km1QO zyi=mRon;=x5td<8nBdr-{H?64-ni`nEzj18R zU9Vo2nzgHX_YGsf-=A*q3owcFj7V=v=|vLT@tiyRHur)NB|Do+LMk*2HR%R`Ko$Kp zZh0zSMa2+*P+8NHuEe^RRD2Q(3pO|bTtrTY4CmCeh^~VSigY2rmN&KgfVph1P0&RP zu?&}&m+##bWI7N7emZC1Ag+KbY{IsLW27GPQ-2whD!o6>R||+%4>(XtP;h4lrI44TAt(~sjcUmtzTA^yjB zD6}w?Pv^aTO~~j+V#cg9V-Ta8Dh@K9zQpXDTfr9>e#2t>0-#0{2ExSDKe6nv5B~xJ znJ&i{1h&8$FljTOW}>aKgb~58jsJoa!-bN^ZGrTkIEYTbjJv#cA6&F$iy`hMTo%B$ z3lr^Q3L1-aK>l)x_O(MGA-qJ4w^s!a(cYzzeU4ygy}|8g{0GcoM~`m9*C`eyPd|dd zN4pmhT+LiqdtKfzQnH2Hi3kV(mpLR%NhMv;&OQ2ugGp5s5!p--`^_k;y z4oyd9C0y$fSFE$MwySgbe;Qahi`ng#!{Hf^;Er10OS2_1TBI;=$$WJhIZSE6ggxR3 za3z(yf~sG}-@82aVp|v+tq-kS_u7c7G0v}fj9xxg^%$XbbBc&bCg5?~4dD7kZp8V& zYcmadtx$wQ2s{T(ghV7~xubhjH+b8?J?M3iW7x?p^cKWV>c@vceA70O$>4NBwUlqd z#A%X*+%i;WkcEC1K18?>?C5HG4p_4$HwCd^IA~rwFW?|wD>b$o$sDK?p>>_f+js`( zLHD0f^Z6)d0iXH5Gh2!h#=A0vtz-~q_K&3m78^cWusNPD&hJ?dZL^#0B1x-2)58jG z`uf#~GinWIJQvtK27Fl2sX0E`+1fb(h-eSKXXTg3n~(wXk>21FxjBe)wWot!S~}Jl zCBcr(-N4v%=P}A|RPBW~>%+Kfg0L{iM~jP1ION6KXK|!U#qWmiDP8yuGGX9FoaOZd^|KvecN)N202+Q1DXb=fN+%}(Zv_{`KL5PbECsxhLhV^I(xkl z(P2keiGt39lq@-R-^{$tyx7cKux@|8oBM1EMfi7~Zc+ENk5YCtHi6~s+viQXG3Sd) z@>-d^mdJ=NQ*pmT)HX|BIVXeLSBo$4(Yl<~jfsLtrgEXy+9klE0kfMKC51bF!aqnt z(&_ygh;*euOD14sTmly9LI~%*S{jBmx zJ{O8U-6F@^8Ot}l%?&5EBbj%>bXl;F!Rfcj$jM-Yt4xVBXSZOBePknwFP9mjQWhX~ z%eO^51A{$(rmI02w*3Zz{RM!||M`d+P=v1<)mSB^(Wuu67gqhyNT8#102WpH1l5>> zdzxGXB_;06LzU6*9z3Q^R^+o)N5{VB`$r1I(Ib#0w7PG8vTS6nV@8vjJbm2Uz?-Eo zJvPMj^mD$3Zo+)~s&p%s`PpHH-zC^~TG$IF)Yd%vHdgNwz@lzur?iatfcvv2eq563<``U+%7AV# z@1cGxJWDYXFG*zh4sJ58>kza zilS{b*wvg&bMO5u9<~U?YNHz<)V&zMZM$2j@`9-u8w2>V=^K)U?9oT$KO##SCFd~3 zBl(|)Mb^vzp#Z^5&PVF+8U>T_IW*42@c6=46ysy|4=TijQr(E()sM``G<6 zWNiFGg(NA%&M<|P@vV@TNCBunwUV-wlotBh_wG>sJoyP!|0;m?=*Wb}*#mJvW4$%IvVqaT|rq`+eUi-HlMi zw}cB*|5D$;Uiri>4Vl!=m(}nwIa>{oN)t71*Zk~|@bgmE zAyY=Vql9(Lm{OZ(IOm+ArzdOK>$dko2$?|cngJtqhFpQPe8PhC@PX!)7qFS>1m%=s1HaUd&Y$y6NOx3D>Yc$-Cr z%?YhsxHHt7mu^4QTVmYY2o%ADCAV68@@dOCx>jcYod0ttQa$@?O~)UGzQ$L-*w`Xp zq$=tXC?0KI_r*V2zH2xSNCOC4cZJQZmsUTb81IjWhPB%ff-icWFZ0UOEPP2LeWwdE zeLS-@=3>{#6v3!|OBN3}X*T5kplwabLsb%i(aK2#nt(gSbUUR6&~*BGlPXlrIHaXY zH>1%CbW9-O;`c2Js?iyhzVU%C#G&2C%(~U zHg?am^ct&$<@4?8*)<3-%25;HHSj-}muU1YNi#^+EkbUDXzl+{;7*8It>N^&K_9j# zi6?mm5bUI{_$`OAhp(KbitmDJpl9aw)juSqemjxat@n*-4#FOO^Cp#)ioS^L3aGRu zW1qgX|66oqDfH}Vu{Vpa5{K_eCS!KgZLGsm0;@ma#@EFEhY|F;s@0GD;P2Z?lCfM# zDEa%MnAhNJYFFMpC0aT1%*PU?v|OlyZ{w}|#_ zeLFUQL(Kg#98gpUpG`XOXcl(GukXV@`Q}=!%~ZH}k~}h(_5_H+RzK9pZ$FS&dGGK) zZdCDyn0tOiOB$cHDkr{*-nTZ=ao}a}!mtliNO0w5O=*;t^E#$7&G|)#KVaAC{yns4 zZQ>=i7|YAX$2Jn6HTL#Vr`|BxCQ{|IiYaymy0PDsUhd|vEN(IfctR17m&v%k7@6hL zc<0!MiV}--TL^$Jt?u4JnYPHm)RhC^2gM_+2ff~%H8;gBC2(dU7xmPQ6a81Q7?}yO z*6StjkaX2cw$b-QJiU+J@8ahtI~e1CAtAi%eSmT>NkPf@-0MqRC={dn=9sC&Cv2+6 zbiHqMz|seBHzw;~T_fnL%A=j+-1No%7j|^rIbRjtrG-=Xsg9(2_IZ!=c%ZvlvBU|}SwM-ub5q^xTU;79(^}4l*ZPwQ%uY0=4%g7*1$9EXeQU08 z(pO~_Pbc~xcOgD)b*2u!9Y?UzmbwRqU4Qr*wV8dRlexb)vg}!!3@XO7WxHKfj!!M@ zf$%_tTVC`+q~f*VSF4@*I&-qQS$&1{rA>R~yZ zsH<~3x^j1~lT*{3C9b@~HAx?dtRKTQnqzUkX{<41L1V(p7cikN%r~p`)RSAno+H)5 zo@bTr!u-)}N@F*ALOEzP{*>y9dV0{jU$lf5e3yeAsajkd9kbeWoUV 
zvK>#4_ILaG@gR{%D$+tH{TH}*nW0rtQ9Zi9(r*ODv;OhOGV|s&V^wZKi@1r_OXH1A zJl>P)0zvMpPg@Jnz__V;qnY%TvjMN`oAS4*fZuG?!{k}R7b!_k5Rv*4-Ils%+u8K( z>)nl}n5>3?itEZvqA1K1Lf9O>h95MBntk0{uN)Cwd>?+o#W){m5l#DZg~<@MXmuYV7eQc0J%fBNVvsteAJ{O}WP)YMev&Weq@2DInZz$=Zy9GjFa*}5z7}0yi(|<U{B{Lh|N`EJf%3hbu3|8&pXsPuZ!a$m4-IUk;Vs zU=%zy64j~5R{u3>{eQ?vdk=e@hj6+*961^6Lv7tnGNeRe28Ep#;ln>#JrfqGCFxqH zM?|Mml4a$MeubP+jDuIu5G~a3pc3GaMP7ekeV77>&Iq%_ejB0io^{}LX3kKhYn#>e z(?NHB;~V^GOVWYsMn6PQS+zs9K;0_Q_E~dk6(Oj_jGVMq=_L2|fJdUiX_I@|)x;+b zQ~%@-&l>^n$Ae=t($msu5jIeIKC^y#kmFxkQmSsQMRq@_0Z^qR4Va4SYh;$f-2wq; zYxUEYeuObo?@*+DI;;H+PGhE<3TS_4(w|HFsEd00EC-Ib3{<6zaq{9VYhGTBSYb~) zIK~r{iCRZXrnlok*-44vdbycNQELhSH-L1*R|bl{SW%cgD%AV|HG2nk+(fw zf9o4;bMTTCPx}m&nz!2&`B>ArY6D+!JnRAm>&E-Q+YaXVHJU-bQe_e`ZU$d_fvEh;38NNyg-Lw>vV3pG z@gjBjdL(LN*(t-2w^MixnTc-Rvk?wo6hgU9iY*ChOf#-cn)$!wCAq)BZwvoGe5d5L ziMWyX9eZMQh6lx`7Iy#B@!2x$thL-|nT%#zb~3%DN|P_l3|)yXa`j)Z+$vx>+i(1h zESa^@+@X)?(pi);O8ZAkx?Cs)lz2}gtk!2+VMdc$jLUhgL^NDVH(qX8shaB|tv!mY z)-MEkm1uiHX0Js`+k3pQMJ?c@MZT?fF+$6*`TUrTjXHANJO| ze}&@g4UFyJhf(6qJX=lAyt%_Npy{m%03)YHGQ}_9F{$ZF$Y

crS=wSnDwQ-&5|v za1Tq3-dr$!n!F-6{@_|O0w!7G8%>q{k zUjyGh8|uG5?q6W>S1X2F@j%(|HQFu6h-Z;YS&#~SNN30KLCv(Q>2%s#@xjaG1ZkHP zHd0xG0hDjLqeb**3DUyN~ZNRrT(@yJMal7;rv7U-U)d!v}*Z^+whom!+}i zOvx^7mqvh_VRbB~a(|Q##!E&!c~|)8Kmnblg-)_Pv_FXR-*pHN@&Y>LyVkvun~a{} z2WukKw0n|#cTA@bG6b~uQz)(}l^{zBQKltf@+3cbUx6G}i2yin_j42r=n2g1%b%CE zfLlFwM9*~yHT%~!vjIekit5}2FJ$FBn1ww|bL-5^wwF4|dk3!>w!~{5KQOpG`Sl#N zQjIEHPWr{`7a*ST82lT4Vhz`3+L`|FenL2|*V4#(eWg}ZcmhG;lYjbhg+gfXu5g^p z_qu+B8dFtY;!(OOTA-JWf==pW9pSbz76a?r*bC^G$IbfCQbB*O+(E5G?$IwfNu_pq zlS){tOd@Kc{!GTBjqgT!GYXDI!OftQjV>^IftEq&hyPx>LLf7E9~!nmzxaPIp-u$) zJ%Ew?`?LREGJ;C*-|vC82>Ri_my7?uPyN4_AXEeS05dwuQlP_qFa$p75%dA~zx*SO z$erfXLl9_Z?_0=8fh&^DJvc)$AN+x%}F6vhzlP&K51KGd>IkP^1=VT;-0AvMZDZHn@21!$Rk4NOrH3UDM zJ~;zmfPO)q$FQU@@bQnQTnXyHr|mu6|5;L7tNZiAW_iUjusX^B4uPhC6uAr;qKJ-2 zZShM2weE7~bs;_6&%h4S=r$Mtj^g29XGwaL0-UK z4^T8;2La$f;0)+2tROK25EBd_8J@Cdn7T*Dfs`~7d|$e#>+7sPq-C*>$VXo((8FmU zRNTtO#=!;KLKQ@J-iJ_sNNPR-jy)S4^v!#~nXPZC*3ufVHPb*`-2k9rR>b%NRmTCa zA{?kRss|!weLqUy-`i!EAlcQrH<2|tBZD@&#H;bUvoxKoZ-Y4;uLf-90KH=+E8qw= z_M8J&TiHi&B85RKI<~0T;ZagtEc$-O{#Yaki(`l!TNVePx^dx;QoE}p*1x~6?qMcb z_<#^c@H-_JqTqdT;3|paGS+lj_bF6l zADG)L_%lQ4k-sP`^g9LK&lYx(s4xdKfOt~fE`3A8t?Xdn(jQ+9^teNZ1i?+JDXy1( zz*62ET-|+YYQ6hE!^?ziCPpFk!RWgHp!?|qQr_2U2%6@0aMpb!L3jaWE_cQ@i--X! zz^wT(b0eczmY3XKXz~^Ybg1D`Paz9)9Ctr!mq_hsJqVc<>Sv8)e6liXM{E9k5r1Q{ zIuEEEIhX3NR*3@u66zb21M(%`6j<;x0S%ax0JpF4ul=4&T^l&@JREBlC|v-f`NjY5 zmjCai#2}06txStA#RzBtc=Q0g#6jRa3iUqdMXy9hevRr!05Zr3<>>doan*K3F>6|8TUW zPwpl>gKK=j{p{Id?E5SNzg>@h=2u%X;CM9u6}0`UvfD=neEQqo4!wSF0qhd!g;WT~ zJ=$s781X*;@xO3Y9fj=JoU=)LodcY%fxAzBYrk!K^$UFvDa9YaRrHj2(jJVxiDquZ zw&`N;d=giqiTI9RPiX-rBNOoju!D|3thr{dasNb4w`6lb_RPKAUZ2#sAePuL6_{BS zE%}@%t7e?V9%#TEUMl$ZV&6%SMT3Et*57a6zHQdmD^i?w$IDQpcO@kfo>_dkyWCMK z9j?se+4=rNA9zpjoOy)Uh!d*_eog_}@-pCem!hOr8B%#?vijiQ3gG`VzdnqWx z*T#I1DQq3HlM);k1re8%fXHy&`#@JuulO>Nu&!HZA4>87)2>1vTvETg$fkW0>_ykt zDt!ZC7q`gdC~167TlPPaf*gh*mAH}h>s=`e9uUvl&ReC+j_-N(n>$nf+)ORwvnRWy zNfMk@S~zGd;wONF=^&ABWgOc7w~s3?6&&iv)YtQ$BiLP)&hir_vCpLyj2XX2H zA_qj7QsEn5z7jyswfF=08t4aAw;aT*#>d|LymtcPdCQqoh4z0S<)UIGn1R0}N$g#m zQ~{q;u$aD%$q@x3Rx*;v$4dkUiW0*D3P}`KkU!$(RQK2+n6;IC2R!F0`=t{Q?To2` zSi^rhNQ{0Af5QFM04ei?(TsDa#gXOR!LXC;_IQxr zLEVpaEe@0EW$?MBxw9{QYRlz9&PTR%?87E_kQ^mLv^)Me-W>N}?4`_w3;qScC;rd< zVQZ^KeUzZJ-f&X{c#E3NH+3+iO-^b1A#-i%{kM!Urm1h;d_WN=MWly`&b1CsRscE( ze=ve`q!m{SR?9KC00X^S1u)C$H<`?-OLShCpiGhCW}_8{26-FuN{PYGyWyjEi}c?E z&awUbU5MkXg7ecx2y}MnT2Zi|^cCE}RzF2EI41%wtqNB*9kf;e09NtFZ4ha4Qwu6Y z;y55@Sm-C91#i4Xb)&4pKJ`y?;xM+I9nptf&7FGHhnA`id<Emfrc!2nc&u?D$;4w~pB;gwGQ#K`MPF9rG#4@z5L;M?g@pfca_xkl-_ zl6ZP;yd3mG6xxo$PRFfr=>;h&0oQj|4pLYT?2O}JCP6~2u(R0)IGt9WlzOw*KuTw* zoZyga?uW;QTP@O8^|l|+KeU$EqO>z2Cr*V2S>%W7YsqcM8z}UxAv|(ke_K<^<70WH z?&IG6v&Gm9o8DkujLxeKa~Lmjxwcp(&_*w=qw{|eyX#U_f)2Be_jy{~0n{%Jyli{MCk2ngyVAL0xX z5rJunCl8-P;z2S&pU|Ky!Tq8^57b5Fk&z%wNXl&_SFa)s3#GOu-qc{0i9_4d+Mq0A znz>_d%sTiCg%Nqk!6wa!Rzshv*-FzjU#uOg8fH zE*Ty3I)W>LLy)x7B6TR&^!1tAi0W7Wn2JYrhvQcBM^aVneO&6ht0Qg)+@1%n)Wiq9 z&qlQI#T*|zuUOx15nW>%Rf|Wga44qTzjNYr{I(D_fY404F0A2rAKgvi(-h+}-@r|G z`Az{{|6H64G)p#tjC8BYeF$7S8v%Z$+#puEVRvEAyd>I#sB~6p?z(Qf_8)98>vGS8 z6ac2RaiHqVK4FYg3iyH!1Mql0RbgR0*1vyGQ7_9u8ST*mGm~2BR^;&(UYy#LYP@rA%7XnY}(;Ix|q`4SyE6V ziLc^alKWI0EJwyTk#*PO(2tH!iI(7RdK!a1X;Oh7S3nl=0D?Td;IKm26Ls90&B&pY zfs7D(*gLwyIeXEsSa)AvP(OM9 zxi4|V9BnkMg-+$MT3=#a=&dObR~Gho#m-g#S}%>o#I{Mo^7-X;`kWbuq`p{~U8D0L-`3H{n)_v*f%SUlmy|CL(#pDf zYmAu2P*Qa;GdZ(FQa&Z|KbwtHewh z?XUOMjLC8#Zy|R8Q`5}c^idI=2+<4?|LH?;tlfYybMC=N4Gqk%;>Q6%7LI zYOBIv^`;?rAil4BUuP};5%mW^?P{RV;B|=I;b|1Q1WL9QPymgNASXjbh$R#@fSzMQ 
zDTNz1yTA8ZRCZD#QTXByc2c=Nn_DB+m{j?a@^bgG0ZN}IfK>3cR60Y13OH@}V-pIy zxl-?(uC()OMaSW@Wr(mz3<=Ks;DANCyE&{TEBG>=-kYD2YT=Oh8PZiH;S>{Ae38Y1 zTnm%DHs@EOj*>?^vn3!!piLW0w|+8^+#VvT_u1O|@D*8m!L8z36_u5)&cMUlxG?XP zXHKE2msgsp&&t`spDiIZAD=9y7)#HaItEb28q{kxdvL5oJfsDVwd>*3bk@Isb;3tR z|Gf_P&S;eZx7SQ8Yi6#cyi4DxY-RRFu0{XTrGb8H{Gsq|jbXWSC)vK_i#g!HBAa|Q4kqZanwaT2=$ z2_z&}GlP0TfH)7JX%qpjmfHcS6Uz$#ONgv5m4|W#O#5BaCij7LuZ!{%(PrXoy?-$z zNrM{YOR4vQmRQwN!)i_w4oU_;hcYKv$ZZs-EZ3^!rgweGEv|Z&%7w%iaPQ|@jQ$?k zk;r0xiFp;w5NKxxIWTE2wJgQ(i88~vfq*L(jbAgqxcUI)hXs)DqblC?ThF(Ac#AJ)Zh{0wWbd!ae-b?YEl$NF9aAUEn;#4Dt-XPT`o`@U@TV*aIv zYG%8BWC7#-W=bv}HZ3ix67TQ=+Ms38FY1>(;2F>%X60_IqC_H4AZrpo40_>r=?<@cR8AiDHSMv zlzfjj1uClLkk7t9Wu_BzX!vsn_8d4lE%z|Eqykr{F1m!d2I)2qox65?BAP)baPL62T-EIh~{(qX-hndqeR-J1)u<1i52Sfa2S^O3Hv1r zG;EipKCgfi9J(k2dzulLNz?(H=AV-q=$s48t4O%uIk9_;p)j89S1WQVjPW)a>`(DN z&T&XtDRg(&w9h!Yu$-OEO+0N6?wGj?(!u19q@Uv!Y3I4BC$OTYXdZT+`c4n778BsB z?REeK+v_U3<_?q4Yn~bAfdwFON`Cphd|iW?h|CejfRvd4a+aPsc% z!(zi~x#sKe@N}!rP!R$g>f_|x;cs~c?8#Yx`DrjR(9WW;aoaGTqg3-1X9|~gY5`w- zq8pjqEyxb*o19v}`sg0gCRNtv)MM=6%Okj)e^X;i*6EkO)Y!-&ph7C^4+5R&kQzg` z*6#F-V|CSnm4|`M@-v!&Ql*!gBqe!S}_tbux1VFy5-^JJZhWLflV#cC+@c zK!>~eT+*7GsiB71XPxK6=F)fTS5@yP&9M}sk%n&uJqk{O528XsSlcD!2nbIMr~v9U zVIXVvmxF_26Y1N+@aZ*7c~}O^SG-pwh2STT_k31pD2@+cHXhSmc;Y_DFPU7_)i1k}7b%BW-ZPI`t&>ou^o6-ZiG{evy-@1jT-+a;o1W zqX9$l%jkDk8pYqia$^n1$WFNzv}DGUbi+iwy}h1kYn>+TzWC9%<{*JrlLe;W@mB@< z=*IG_$L;<&Hkao#(gJ_h3xb~~62d4}`JP)wEM9}z8YquCwjW-CX1EUA(;ItO#7wxM z4N(j7=b%Asu*FikhMT>XKZ%9g85gn~0sxVu`HNS&C{%TAC*?AACf6|yZvHJM<08L*? z)U9F};e-)9Ih!4k)OMx=6-KvkUIT4Mw_nX$Z-w2K?%7W46kS4OPM?ne`(%?dMl)e^ zk_&5NS%&`ob^jddT2Wo-jl;16+sjS>(-x;tfEZI_&9LYQm>1Y(jYg|3b}OYvxzPcX zPE9P)uJYDk028py$lxYSV-d;ljv~9mO!T8-U9-S6ro`2n>V_7U2Q~-XX|xqTK|Zk3 z|L~Z7W5-x*(co(*z-+wjjmI`5*U$<&^jo1A7uC|6dr4M^pOBmVH4d0a+bSg1^zraUq}5xP6Cc+Nzv)wm;U|q&YuSkx|8hbYqzPM(J+9x;80AZcd--Hl_JMxD&Wj-tK;)0u2J6j=e zdOC5Rxp08EJ8>^e&D&OWthP%{x0Na)f87X_K~b(|-1m1r{X?+>QowDZxHB zkmog(kqozM6Wg*^?NZ**LJ#M{W*Kf}ku46fA)X&Jf2`m1`t!-t@XGceKb&-FKkgH| zW6naha22z>7GcQBe~m7S2U8J*9->vfiy+F>@djdkhdYw$JN|cCBGpo2>!_+LmFtUN z-+Z8Dxdvy)Iol`y7Njkj`{=4l*3$8$ec{CYXY0K_1gyhG8^lH23Cq7}#2JD?AN{KQkGcmi z9t!17ZHJsQX_Ee_juhR|j&6p#Az3yscAmJB@p5IiTE;|TVVcw*817&9@^Q>xK5upZ zjVhAox?33}_x>oZY7RGOL1~vzooc#1#O%7^^1b_(-JRz;M)aw5w>UG9(9?;pf#NJ; z12R3^Nlr^kX62h=!fXcGH+72PbHF#6?XN{yKsNtXD*}9J8 z&?lmOw_bxf-;Qj`y9C(G4ZiB>ctoM{uce5pY|-o6?K8fL3IoW`Sm`J#{kyC{i*B}8 zu0SXq-qU-|fz4D$TJPWT0=VZ{s3hJTv2d}9iwTOHY+UQ}aXh432RRm2J%EoI($0-A@R(yVJ`PC?YU&HYg z^d^o;!`32FY1_5vd8X=2R6mA*=cTSKtM~uqM^aB6P;t-^$QK z?}k(aZQ*J@SlfJ35kuPzU9X<}-YquUR_ z{`?Leijw%#)XZ~7&0HhBQx5EGe4=kUQ;|WnSx}R%Z>fOy z?SDry^G%Hl8p!di|NGO>P8C#xupKyt2~GVH-J*|8HTa?9?~#wb=?sp}n4D`YmEll>9s@MO0SyXn5=f)0k!M`!UFQu4zoCDvc<|d$! ziUDK1r_dpJNtNgeG(CVhVFoDu0OAw&tNXuA^(7FldpO7|1~yg5^zW^tRJveR77VC`pmTjsG4G%K9jxf4z1*x_6SOUq_Zf6e zs8eH-^K-1hWOGbw-H2f2t)Sq$de{;Q)H)PzoH0J2^MS4&GC20vmH1|lN;`#V4Bs8#(KV;|(=nzg8>h1r|GLB#y};L zI~WYABH+NEnLv!Re_#MSmk6%_^j-j&;kqAxH5>plFPabIAMBi*Qo*Fj$UNX!|L+BG zZ{FxWr0cocw0Y-_yyijkD@AhXdW)(nB*+pg`}T}il!}U$c;Nl^8N*!vnn;!TuDPM% z>niikQzopqU`_Gqd(bfOuN-8o0$_c8$s3LDa&eaU1q9Nd8G`uyb4#Fu_p<_K0pot3 z)s2g%mk-KB9-ErZum2D_IsHyQpLe1jc;P`2H=Z@e;cW*aC_V8h z3*7ta>ak6N!TpB-!2i$JON8w$(l%n@0BjWGkPj)a^z`(4fjcl8E+OfJ^AK0V@ym; zYy~D%NpM;MWi21jopdx2(pgSoOMKpTG>DiOtU7rP?W!BDYrXQK)-;^bgs zQ~G#yAD4|#p4&V10-BznVa~l+Ouz(KjQBT*0f~=ofD`zDR=^ruCVD>feYm??V8)gH z`ng7ql>Y9??!w~-md}_wXJO*u>|z4S2hB!8iu^#;jP%^fVhtd$xsL`+I0?j`+<@Ok z`Y{cGpuM^Y&0V}0^V0Xgyu2iauH*zj873sy7tpZD9fZ!Qd3n8Sgq&V1^;_U^+z))2 zPSqu-3S5B2ROjj~fzDF{8DuAYY99e6bVQTIvPhg#l2bmm=|c)YUa! 
zir$BGTly2rKr**5A!9j(c+4s-Z34|ZB|)6M*sk6Wh{(?yc%UKl1x)^`H=vf$9aI4Q z4ff;3H8naUVIZZa=EIjC@ptAE0rj>FXuji7w^5yVS-@PpnU0|5AqthJQ56el<;u4~ zrM=QkB5doRu!;Lk+d3+aQSt$pg7ZKluEyu~H!T?R}FjK!2S`SY41ekjJv@veQcaL^8E6YWuPX|#s(G`VHxlAAT!F;HH3yc_lu*>@ zdzXdPT(Qdo)6Yt?=tX-6*!{k3sVtEp^#9hyj+sfMA)Ex)u45Na0%p2L#2mlz8-arI z-8=2r%yc2kKS#u)UN1TUf>i_vy-P?RB;WbehyWeV*uC>2eL0U+4gEqO-7DbLKWN(T zd8OuKU52;WS^qnd^kzA-e`)E94&f) zi_o7y0#}CO)XuFWXa^i;AqWTv-x#TcZ_F%xO)M3 ziasP^UA>tY35jLqe9V0*&3!jGTG#a<$>4-5>&R*JP>Of`nK)8(@aDQKw>kl=zAJtS zb<3X3fj3mR?TtA-LuFn+QCBfH=ijl`Pe8!6Wz_+BEKzgG;&4AU9%u;l24>nf*qclUC7|{S;2c%Df^nZuMy^~umExciW$kJxB_fGu_N5_NRZMlQf zuK*Lk>d(=J1J-S-yHcNywTu<+Tme$WG60Ys`XC+yt6&%#C;Ucwx_th25*eCMmadnG zt@Ed!j`2%9dr!?r);f}Hiv2Oa_`{wK^x0yUNM#`C z0G3dmO@QLTi%WG3*4BzDE1QWTIGx`M{ZnqEo>Oq6I7@<1NY4_5hGbJljUH*h#qe@p zfFl3Q&a>#I>lVyanpzs?`6#Z7v}#K%;th$&G7RFk|4{6E?6ZEMg7iP__`yQ@LnsBYVbl(u-(*MyP3t1P zv@AvP7gUoNqQ4TT@;Ev%(Iu-&qN%z}AH0n$9;tCL8s7I14#S69@M0iI>CN;C01%dO z1y{7i#jH@(Pj6dJ$RDzLZV?V^$jx%<1#~hB1XHGq!T=rqmLZRY#Hgb}#W{e@bS$xd z=9iQhaR+tYreqF+ny9_OQ!W82{y>7?46JnVtyb-Oj9F4YwTIY5{#;Y9$TkmtZZU|U z#p%}gbo)7c&6X&A&avy9L_=-!bIzWGrB#*Fi?+ z$woiExZ#rr9D!4n(Iwe1{nHQlNcVOb`hKx4r&=9!R{p?WzyPsLT3})e=J;`8efo`> znT;5F#sg)?HF44V&zvs;h=5w+rA`QT49Sx=KLkvtuIc^qm#%<*$|_Is?L`lM$G0C! z8+(4HiLt=;4V%;T_TzjvH9W{v7g$J+#-MUhV5Gg83e(M!(HZG zQa-VDWec`jt1Nrll@IbKJ5&YJyICx}I$AcoFgnaYMztoPBZRsV{(jJn~*uokJP|*XqbM zpJ-%+eri-l$v_1yy!y%arRRr)fFC1{&_M*B#`J==aX7f!4c@|AtS0lvP7tlvD!mrQ4(ziv4| z`aP=7hvdP-uiUMKHflD%3!hCbrQgKF{~i<>=>;?py8+U8@gny?9lp^(K5KQYi8`gO zM-;9HYoKg8>3zRfjwP7hxAX~$cjz%=Y;A2AAXT@S+{#7#MbFfAJ%Wx}3##I$fg3?J zjV-waYyBAU>-KKvsV8tgn4vRIv%zv)W3X>x)d$OhVWkiC5n>;`>LSF5<d{VI|QB@k7{*B z+y=e*`8lj7?~B`XjR5;_EAV7Zm^Et$Cc_4FdZ`2vXECzpHd8X8aA^C|zi34O3@jU> z|MO4-rHsrtQ1LMQh4ay%VG`5hVXW=fGAClQcNOQLY{5KyS+w>=@I;3i@Nm?v^T8f7Na z2%fKyKEt0GBv2r7U?6Xy@g87!EYF9Ald z5SzQVjzc(Q6Xb-`+M_z5X0kN9_L0qd!~l9R!a>&cc@ZVzbE}!Xr;GrEzvlxweGDtE zi8s{ZGS%FsV0b!|3)=kdE*PMGaCD=ugF)g9oi9P9U;^N>fo^aYwjnPny>lP0f=9X06DRt6wcm1GVu_wI4oI?T^_!CbIf2_DmPrPkJ298*hkb zn$kd{>PK;7u2Di~&d!AP%fCDMe?2~1mRo|HIQU`%8DP}$gP#sD3l&h{`M&A!>d5J= z2U)L90yYcZ@GnMM!OV$)@*gU3@2PcB>QF^SPk!3ne=tB-JntG|Nz2s#b#~^8%|}eB zW#@JbNZYWO)B%K)2#k*Es4vkR_y5HpcEki^zcvntO5GH((M%KibfK}iwZ$fg9`iwM zuzJu`PC{(|ENnci8o@{}5Qx!A|`oym!LF6#; zUOpE&R-u;DhQU1f$D5m*M@&JjAqtN^QRa?-9@O_m8%X-uoF5$>-F^)mB6jJJCcuQc z#ADNX-cx#(o5?;ydNn;&z0Dk-g3D5QbXo1OFER$?N|r~^U#AA4L;qPU*SD8h0JXC> zlo8D#A|kTn09QwXCSew+VjW;Ipp^f|XM{347npt5S^xE65dZ6Fd-@@$W7sIUw34`T z>A94<4;wBD{+R^4pLLJ*+?_X*zau7QZwfJj(?>w9s0oB(QH_AO)I)fKX#L z2;3^Y?Q>uS)#mIVqd8fmOBL3S0tG=FuiB4bL^c4Xd|WSNe`nHnqQ?x1C9db2D55=g(@;GMj2hDKD7F&-Nkm~2Ko3Q@ zP;5*epfL-cZ`Z$onkSG{$_+(-I|01g9dni)aRUruDc-#-{w zrRdLti)b|#6#+Qk$6sm`oK6BAD@d1y!lKchBt=#CH8qpWpEc>mLRel5h9GBp#zKhz zNq9?E*6ic1?kLp7y(O?nz-lE}Oe5d{f#6&swu6a)W&ZRlB|+E{XsX7jaW$3-Rj`XX z5K_W0y42ySj6yo_k^i3fp~tLcWTMf0FVJHsg<3KOaSblY1cp0Wk=KFQhR_FKmg||t z;?z1V_<+_fu_!@GEkR<5$jjCl;`j1E)7orMj!CuDuvY~FMlM($uG~jW10aw~<^RhM zuD#VRT!L5k(4eI`@f&p-g0dFw7$0#w3*xC-7=70Zn&SpQ#zOO%?t1EN`JNFSter{) z3wsL`s$h9_byMfAm$K@rMp<%;ImrWa{bF?rI&&;v!gVnVA6nwvT-<(6>#tv4bSP@$ zwvrY@fQG=V& zx+#eC?%ZFwKuQm^+Zkr+L_J8z*~>^vEAbk`&>FQt0T@q{iHU7NFna?M`OSR6Lju(x3!A3-J&-sb zR#)Q|51#1|QLvf(5snSXXYkMvFDw`3R_2#BRRlu^@t$IYjURaOz+*e1?3!=t`v6Pi z23}MR6w_z+N%gSwYU@oB?8ZE6wydS4rCvZahq2M4uN_>+6#G?s{)g}3y=HvIr}5aC z09*b2qkb`9$nycrhrHrXLAT}jy#qXG8n7t;pW5C!EUGwM7gbbH7{sEx6zLRckZw@A z1OWj-8fk_cqy%Y@ZY88t8tGE05kzSiQW~YR4rR3|D1bg?ZP?RhQqg8}8w`qyBw!&0Q`L zPi5;5Hp1#;tE%l!y>sjbORNRY#56xyCysnYs-wT2z-`Ogs9bd7cu!^`M?b^<^r*7t zf4gz?pHECs=?#Ok#q_`V_7JE07mmA0evNjjo8fuc;tixe_I_9dZQU30t+6I5?Gtzz 
zsz&#z%OJnqc`@C6CFL)GTB{64JsUN1uj+0^a#-uTT(zTX3o%qvdsUzfq-42^nOmVe zElwY5&f9;H|C~8hmPlZJ5m{3~l3q<#$->4qOz7a)hJS}*oOV1J?T6Zt;9FD>War4pF}}o1&w0F^(y*1Z$V6)wD^1aTSBIu%hUg_ha~>@l z5f|fjcMPJR<_*#KW{^ncNmW-|Xun~Js6toY1;P0MN}C%Q+;;oY;@y|V6lr|+?7KM(Xng*9&Iu{Tc_@&c?K4uJ8NL>`nO(q0iJp%6ju}7n#7Y6% zRZ|ByobALtUSt*|T}S#JOxN~w0d82}ZCtKMVzczUo3nhTADcJr%k_gh!Zo)g-js%P z-E(5dsKLygF_29roAxxhl{S~!iA!e*_|QbPNp$ljS+Zr2q8$PRqTd7H6hrMSCv6QA z!f0=5kLiGHfN_~zvE8%LIx~JcM0jNbPIliF5Gxyl^?bw{oWa?Ud&h#cqeP7>lcqxK zS>1C8`Ob#?ySL3?SRMKi;t&U5jC%bLEQ7wU0T6cidDFo{cIn5Tto)$7aq|Y3WBze0 zFJkN36!2$?x8^#l&U+pLFM~uFbTRyo%br}Oc0+5XPAPNN%AC~TGgt3?6A8UH?$|bv z8!b-ZOq{aNMlPKF_1-V`U0`FsZH6QtjUB89Oema_Tz~eU6S7VP)9koTBJ6tCi3H~( zYp-@BbiX#@3Y137H-xACFYy1?)-@(}CblRlDpHv6D8o{WPrd@aicuGz^xZQi#2VQM z^ew@SRtt$;b5Tb}Q)e`(Kpp9ABX^`YgMF8JhTK%UFNI73XiZ?ARrsXwmDpcBHxaO^ zMH6{q`w(p!zzUGvcgZ*iu#BaJIlwP;kytP8UUQ(QT@h;2M9yK|jplxW_49+WN*yE& zRD*iw2Be7yYkLj)MzAQ#_I-%SK@(BA4oI!M_)AV2?05Ra#QNua?MkQ132;3gN(4_q z#qd4KXHx_TsHwrd?mhs&^G?ANTQ00!f>siRnr+diD*oy;>W z!HlouCiI0x4@iw&UyC1sfap6B9pyMyFO@*v@7GNqTEy-d>VM2&yL|*wrzMBa4lNE3 zOu$E;?T@^8H(N8CW%$%CcrFcF{RFS0T|vA__u1J-X8Yc5ATJHEGt&JA+DpFWD$Erg z9cV1Dc#Dn#N2BZ%Yfo+&UwH^)65BCq3BQJg3*_|DHBcQ1P+4ZW`!@jTE`dVAjbO&# z9nW)ib4buCo(x_|nv8M4I=Mmqz@0xjl<}MN3f0?CeX; z9hxU&oc60v7A`v`9)pm9zU<*^$iRG7Oh-|eX1O`t^32nm`NBwzcMrcwvmsUDy%8~v z3sXazBVtodi{d&$<;)x$=M6;L@$Af24MmN7Zu^yZ8cFQ_4Q(F1NXVbQy<@oc!>imp6+ zm|wted`S2;R_?8);I?Gl`~~ExyA22JF=t)~$PW@u!0^#<{VP;RCLN8cgZ zmNm)gD@&5kvHT?9SsfoOGcR>K%cLuAov)hgG=Vj`Y1MbAaF{rLm?25Akh-!{>^fedRWQeGuFp?)%vDB$R?YJqHi9XSYNFoE0AI#EQOdd6BX9QRL+?E&VGQu)Xoh{k3xbB}CxMHs@d` zgU(45!1`i%*hV|Y^x~3D=+0NVm)JS6%qNqXFnzXd$K1r*Lr{~E<%CiK{qL@C=#Lq> zEf6vPe0c~6q?X{a@Q>=qTj_|lz1`hgHsN}uNR@r*5d&LQo6C7hMLl=-`JFOly@@F( zDJ_QD-|{}g2r#j+8K;h){Om2*yr0ZACXf(Sr|xcG@UHABo!?&(>mmZhbmcevZzvee zhQ{AapMKY>Q84=Hb4SHIjXNs%t#jr?1eywX2u9Bn2p|FH9U3oncZi#_`NR6ykV2QH zD%qZ9TaC}Xd#If-6^fNKiKGJWhD&Bf8Sjk6JF&@@&HV(zpyO!PF9Tv}z3#{@!va~(*yoU0rxBA*! 
z;d}Xs(-Z!mIQSMF$G9mHg(d{7T;zCm8a1Be;ZM;$)2D6Bx0GIo=ft4nL(z}j^cCJI zO;FBG9)f8|o}K+Z^SouUAY&HS^{Fe5-_AXuPmy@va$tKo)EyCqEmlq|gW zaLc%@M)Z0?J0`+WOD&DmWVSW<@*FGP@m5lA0Dtks$?euJ?~*DRT7#+elN+Ihg{+;3F29GursO;6RC-xFc-yR}G^!ECp~?KI}mrR5s_lu*qT z%O%ze>zc1YQ;>X)*ieQB(_!WrBd_CGBx;lvY@?4d^DO7TYXH(0umSNz%k`A=r(Nx@ zi~(7Ubz(L_y%y-|yQ;L4xMJiznN&jh97+&(Z3(w5mMjWEZD1Hrzx^`S?8DK7(KW(3 zO9e}iAY?WocqmdWm;7`SLU%HY5XIF5g6wU)T+)jp?9#Ni8Kk!rCwIq;8ca~cO)9c| z19#J(lGpjPCz#35#D~aQEX`kMYpZ0~Gwf>U>%Qh0BRJlGVDk}j?Z>nYoYRv*4PvE^nZ$r!!S)Sx)Ok_~vp`K=rj63xYoNd3NRXa1OE9a5%jSBd)>*;J zl+n?nfte8xwJa}36gNPo(xlaVi?<-sq_MYE@75~4H6ze4m>K!Bd0i$F65bi>vM2Ps zn5x*WZ94t?NZ z=&BmGJK<8x9ejs9$zWBK(xy(-*=Ev+EK}Sw{OTMg`vC1~lWUQbdzy-t2ZZrF%Ux=FBapktU_dX!b!rL;E0qT3;#qk#n^C8uKz2vXQTH{T8M zGSLRaFxvO6#Zu<;>*T1+W_=p?dvP69o^)D7uoJQoVM7jk@YJJ|Ze`C#q|2Y#Iaw+Q zYaXDj3?T!_hfaD=zpec>y=7Nu%VWLunf|cuYsd=T$g9JU$3$u~P1hXQq`%@2Alm9i z>1nvh`RMO(p8R`9>Syecka8-e;7S`PO(cfCeU;+5T!^jP(9fMCa4(h*oNQ+v3;k?O zZN-NLGEsxyLD`LMb^nK`|9_I-W1s&|QU8A_eE)6zziRxkfsfcb_Wz>s$EM)@lidBs zP9mAOFQ=xa=7H$_>?3KmhC*OVmI*cS{P63YPomTAFHd2dVaB9ZD0ul@aWMuA`C`Ya ztmKUn3FCWW$w>Fp`K(@v!VGyE<_ZeWpFf{h>k1H6OXijWM^1YiYwMRaAE~=tJ?}0j z&d!o6D=YtT(bi6&2YOWqhzlBqCRu>|*JMZbMtl^;bs`wpTvhLfYMr#{k)d3qsUA+VkCB z?!ki?G+GOb1yH@lk=Xh>!qmq(MF45- z)yyO^sqdaU#_}lg1TW`qLO5&;Xv`lrzpv<|z;c7S0ZaYKDaHVAB>naOlx6qmm%sBq zgqLHmdFWV^+@FC|JWqnLxFe8A*s-`LR-fq%66|hlS~o;_$3iabJDdaf{NGQore6?o z(*IMW9J;B#&jA-0D?*7?Wo4m&$#(6{q_Q@Jt4PLkqFs@=hHz6xi4ov;rLx=mqsY zz)OHK&C!>0TeXk)lAJKhCoo%38pe8kY6T1)GDnY%jor9>5%L-AH7~s!CQ0NQ8F>f5GQIR!Dh=&E{*}rjln{ik&%(zkJ#}` zpG7ikhcRG{ezCc=sYpLAT_ZF#ECvt*BluNN4xOhMw|feVk*vz~spjUZkLZ*UfuAP5 zlr3e9b_?iehd^84&QFu^rux8mf)RZRuP9{06=0|xvyey^vRXwjk6R1>tb%EGv=+dyIMI|krkz>IYjr?<}PFz_x(wI zO}4g;8t#Z-XkN;Ruz&!kcekbiN*(4r0;E!#2I~Qg?UM~X$BT=LA7MOJ-WI+?-jeUn z5YyK042+eh&uUk&f(UkP=2s$0+gP(>=;95pitn3JA2q{g&xaWIO6a8(fC0st1atSS>m~YDThbV8d3i#?0E|>SBN5 z7a|sAmia7N_JBJjz<^6cQz9n=I3yP5JUqc_m%RO@f`>w|77q8uEN-qDbm{4XJHsE( zp3&K?P>ABl{62E8KV*>CdVns0Yhv>l7n_(f4vG*8+HJ;YGK#!2AX#)nmW|9ZR{dd2VXNjAW`yFscBoY>3YFSj- z*WW+N+oI7ukS#5cX7!`oEcNOEAKcGK)a%JEvSi%J@OXfYq8fv5S9>s9gPiec31x@s z>NX%(f zo7LYH_qL&nFTyyFb@9of_Zp?fJKcJ z$?7YiC21R@=))pc7pFPCA2BK|DavAKa10K!1OZY~bu5=r2uDWWx(?0i$TRN?{`coP zC<*s`$t2v*$%Z;JB5MY7n7R0@zM)-vxbUNz=QBt6N*{Z%FHw&HexfCgoxGU*GZP*J zgi^V_ce%{U+P^yY&ezw-dD%3wo4aMEzX!r^`_9+g*C?qzZk)ZHmc-R~Z=TXUgMjg> z`w)F|UIcGfGf_n4U*28bS*}hZGc4I>`5ImuL!PS1R*)#SR}3C5VQ;#}<-QK9vvd#c zLvHpQDU$KRQ2$DESR1O<;5?EHSqml5=yjFO{cZjI;cNN11pF-PC9lU2CP{`s7_3we zEx)r8r3?uf$_`OZx^FWqraD@!%iy1*PoeRQJ!E%==Lz>kZp6zs5xZjZR)DvSkcf$0 z!m0qPF~<=qM;ia~#5I$sYWa)nP z+*mB%-VSzxurqK;-3Lpm6nAH@S*spa zOC>d!-bHZ-OR>@B_m|^(=3>?jR}sY1v~7$Tt$P!zu3vYkx=Dhpow7tzB=;fs=U%9} zQRbqV2g;1tk8eChqi2Uh67ksj2!I?+7xIC)*5d zGW&A}iV;)sHA`Dke!B|XwDc6mak$%dr7Bok0kfZ$3FnFV~7#TV>P zA9%z3sfhXRalQymZh(_@-DFCOIAmFCufTmc{o~WQ0Nw?1ZmPdO%!#zE=L576EGnRZ zy5IlLc6{g%ZdC&BSwtmUu1kbRmw!1g$-~#~Z}wUAVmyr;ku)yZ0~XzQcHDm0VVTpA zSpVi_zaBi}C6yu>HBkwrwCdz!(thsqt!4@_R%i95*XT%5yfFFuacm?iC`P`!lr_(D zADc?8{f)(ZzSrdmdLX}lBO)O1{3oDst~#J1eNSHoEz17QQ|-UnOf1Gp=CFG^dwfeC zLZq1x5iwlb(hZaMTe{~CPGn@vm&>(r<>XP{9vE7dA1gR1aqRvNUzlSH9P(Z*9S#nQ}5kL0yM5f+EehpMphbr4m zOBJf%MVj#N-IOW+q1%lfV%Jz0Bk^us+w&FozwZ-`*Nc`Qq$+-Uwk>5ZN}hzqIw*eQ0{Gh3J6M@AyWB*%d#*d6b=E z+*I_jixZJ~m=(Q4dx+F$Zv0|C7)j?k@gdqb`nWG*y7h!v7XUSqOW&nULAa62 z$$RQLk@4{%clGD; zb8l3(QX1Lxgl#&(g5%#7*i!fZYy`ISg}XjO?7|O9{||@kZ1b~@On>=JcWD0NnLd{= zq_0xE`G)d*F~yI8mnY}-eyJP5YsrlQl|>B)|4oz+mL~a+I!w-B&;CIKrv87}FE@X! 
zoaz|NL|y*f*^MiHDvE=+@=9Yt;RyoYbg>P}Pha1Qt&M^o_g zvGv-tDDH~|b9V`Y{ohpGxS~{DUv7)CczfagSdJlR3!pyZCVNRKIJ8rL{cwkxo31m8 zvQXsM>E#Wl*-*#d=emFT<=UD-FGWrME7bG=sQ-BM*Mt1c{9zvd_|vN8>MW))m#&TQ z_hIPP=LPw8_0+U0EwvZz!Yy1mV5lg}`}7+7W4rFGxX&n&>Vaiq)0-K@6^-0hLe~VX z_y^1-9WI34qlPrIpVH^Z2nV}Fuv5mK0C1^_FumruGz>NEU5GZXs&jfrzlh7d@ws0g z+m8u@O$yl+DWXdQ-k*(vps*HSzZ-)+ii_KSq6E6$8a$VmtY;+nDAh0WtD$I z$j=+z*k8H&VTp723q)zq>}&(cf9#?(+})Wz!=Jx3O=xTI+;I(+`Pa?;+m|xo(T$SY zh15EV7TEvNxSpy#yMUbzKe(;@+bH&2aZeO8ijSBEXc=O+dLqxmtIdCYw!;*l=$EEn zo->mc9Kl(qAbzs@NQ8b6#-m{{!jB6)uVG@s3F~n&wwuD@o2vnGY&f@?ADeXJ%2J`) z#Vka0LCa&t)J zbG8yzQ@^`0g$zk6#D3~(BCwlsv`&E<*l_d+o51D_qn{p7(sK5T?nL^)fW&K;r|>3Z z+8uh`AFTYmEfo(wb31Ms=S;7SLkD)4;z}nX@@wHL_NbEU6uUc;QFs99+!6#9Sl_S} zAsisWL{PD}i%D4c-&m}f2X8sIT=(;qYWK%QeKLjJ-hoQ|(OlcU2k7tv0 zR>&C|6J^M*sKYKfJphhUQ+*@2zxnwB20ZZ}hNQiIeZ98oT>M|`T&&a^yV$vswm`6o zL3l)j_81ag>@YnJ|HMMli=4A&8VA5)@6^T12uRH6d_~O$xPGqR9bkVI=>EGxOn}UJ zk=wo*5)ZMolU!XtztacCf2q`=q82QcAmg`=`3;R?<)qiD01+c(xAbBOaS$;1wUOXo zc;#tI=w~1QFJDw93{63}@d&Guk{bo?lNwIdk@xOWw##MvhAp{t^rAOG@|pw$7HThj zX7$ICL_jfq?)ilItr8HSI%BgS;N+XIW7=K$u8cyE=ky}=7ir<>lBH9Ee-OFrgad@a zIYXQNFY#0wpA44_0nef;s(}}F#jGX^7FuT@Bs@zxgZvY6cl&?11QSSp(#a4PI**=T z$--`3l6-r4{)Yzk$qG@0D?)y0+}fRy6ckubv3r;5fTrvgn8|u@t^Q^mEz6V23f^;j z=C6e0lE^9K3^jNpSitWz@2uReXYHxTBMRyWQ)z{#Dn+k%zi5#+D0w6#RP4gLkur5S zcaLzJ7=|}%NOnFN}+e5a{9t9Lz=s|iG)kit3g%T^}ANRQ* zyA}m+74txn`S$JI*ips<^+1f4Q3jugpz1>%NtY%UvWsOZf;CDW>u4{z6%T&mywJaK8=X3_URt`W=a@(S5fYjB(1+>SmWC-{hSQ zeoh+X+$<8c;~roU^D0pEi{8c9-*Uwdk|a3z?RDi|zS7gO;+Ii*K`0gOYw^EJVt^U< zvgQT5k0m8SUr0rVC6us?YQdu3`A5BZV%vyy5m8UtTesK=M&*aYb-}3L&elX4Hj=ne zUrg4;^wXbQJwIUr%t^(*^1DPmO5tpy%lbAC7fYsI;bUTs;q^>dLdnahMn+$ZJO7|y ziKza9CLT(+b476P7DX(5HJH?^;1MS2YmTIpBW?G>i=|lUA=c1M{r}lW6MKRDpOL&k z>j&8}CWGxR%JHV$EKyNWrKm+3zs*&fXU`rAmK79?v{80e#t&jt<$EIrn}Q#p%_H7l zhAtBwh(2W ziMB|7eeIQM&yJ}n!Tw)RhcbLoZ-=US$xhiDg@V1zsh%y9IR&SpGkk-9L zokDx%R!Diy;ga$7?K7im48N7>v;94w+2r))xa6B z_hj$P5S9c+tP$mP3W^mBAY3E(uG{lrD=4YW5ySWIM2sH~RzF=k^j1|pHvU1om(#Y* z^ZR2KFHs1EB|vRX#On@VzbVm|%(L1TL^RnblVHbi5YaWe{^7-xwKp8qyO4QZ0)~`} z1}*0?FAjI#I*(ae*+xuA%laOzjY3ZQ%ra$TgA9jG(P&%w?fr7>;T*~ zK&Y|?H0DwB`AHcglNa#SEiFR;(9s4`5hKO{Ny!XMO<9-wtPAxUr-gQP%PWzb;*!k9v|7Iy)%8kpvmb?fR>Qwccz!;2Kcc#@H^kMJ9b=I@%hDKvFkum`NC_|FM>?IgKcfg{5LPE2}N#awBV zMxqTSyZ0cDj0FJ5(k3qGB%6HBP6U7Mn7W!2Inem+i|gM0>*)67^w~wnC@LYPhmRh8 zF!F`?V}sKXJ@YIor}op=bp@KMxq$2{uJ%OTGmu>!fYVa7@1kXt*V^A-?}wcg-Y80zh<(KOI#Xv7lh$6>w8zqOq zW%u>j$%X`l1ZKN#mDBCduAQ%XhUbE2DWgLzhV|=|wr+Z+D+)^9>nx(xl@A|2Oxr$+ z)+vDelS(k4?L)tf`y&z(`ru$#eM)3?5BZJmhc*NT;T-G^?|rz@v_H#dY*yE&-LTjV zti)Vm0lC}w2PjI;QExCxdBv6{Xsx@t)fMfwH_N75=d<|rO)eJ>R|lkbD5QIV)lR+hIzpzOIYTZW!Vowm@t1K4|Y)8z&_7K8~v?ae*pEDkT6TFl z_((l3!#R%O#woo(<`sI-+uPbmCASzX9MSomrzz*x*HwK)1&`gMbR)C#!$7GR$5E`3PLWJ3>>7O}XaI{bJ& zw=Ygi3ZWp~D69)AlSMLIWia(J7F56=X7MJ8TZPE6(wGNO9Xg*q}Ec@<~0-@J=Q1zOW#g6|Z zuGAW>%Y4tV34;R_NSl)0<1JZ_L{(`AkP?55m^U>mVmaG5d!f53z{NGCpZiHzSwww@ z?0Et&dODPHzrF?cjrAeC)p}r5t8q40q3O)_18G9}`WqB>KAfH66=S~ic^@vz_W(}I z4DDSZD@yt*3{E=ZhtHWt_K=^$abZ!8WX z8a3lV{pzMAdhkTuID4WJJqy!S(|va%8_{H?nijT2NMWg9^<2+h<|B4j89(=yPwgz1 z+}&n!ROwlt>=>xM9jcR-VwJze@_oB62Oi_D+OzQ!X!I?P4WB}lz29SoNF?IvEHE$( z#P?U{F0uB*iVK2s9Vbk=#vB{_@BEF9;495Vvst-Dxd~-3Yt+$-x0#UDZ^9%N ze?UCy`j*V2-AB@7Y)+atVY6RezXX|~sb%RyaV0g>m zK8q{<09gI9L5Y?py7_r{dnuQz7XL&$$%OXLO~dOC2OVl=3ZDJpm1(JzsWZFVO-5t( z;E8A@YJt1Carj{_a?q>&it0dOX;nT ze=}+6nHsHRhHSb~BJZlB)di=HX*t@AU{;0#PVHHBm%mqPcE8?BhM$-JpjwX)m=7}? 
z-&a?+pFDp!htuk2ww-~>=It*vpYel5 zJljvKQ`pQFXI&C5f^5lP;(}4g;p3=?nM);F_#`2|-7#i$G)eS$LvOfe+)G))xI{$y z^eZ2)CfkApBLol;_e2=28Pqq!C7eGZFxOe~=X9%UW4?{3%QnE+pgb5P$m%c<(=~2F z4&%SEdL!x?YcMC*6%-y`p&HL&o(?L%@HCHEAX$V|=ouI`CH!f@n{!jiEHgsrvWg9u38Rr4jR55hX@lOSV&P64gpWyxHRPIo>*2 zH%>0tAWYbR;VH?MB@^YYmL~EQ?HOuB|I579%a=U9UYd7r^5?KP#9z6c`>@NtvM2F1 zSt5cqadH5Nn^KE`Jn@nnoR%K-YcEE4&(M$Ns{Je?cZxhlyPjbCN>iAI=8>O5v=t1C16AS)|Yd{6HR1kV;eGb=obYTjoSn>sAC3ZxH? zMAZFsPAxSuWiRn|ud^L?8zP_k0;J}QNb34RZt0T`qZ#f^58lc6?4=FzWHs=%rOuQB zRrm47NiTV@+Df+o;_~;A8|E2^D`rKfixQun>E{pSMnaXjQ1@VZKzeT^w zs~x17BhIR+d*eIm&pmk1LN9)6*1Zei?Vgv`Q5hsNFM59RM=%kPGRZ#folZEB^hMWe zg3u8B2p*0F3lX9kR>Cg2J28=Y_lw2eLAeo^hEjLTEq@^~!uu$+9VRgbPLJREtYDe^+yxB^k%oAA`*O7`oGvN4Hq^SwCoJm<%vWK-5@*PU>B*(xAmIl+IupcTiFFM z{39cZ%?Xu^oYVxpGbKIed%pA=&||nYlT*Gr+$hWP;-|EYpW9?Vxg~IUNy=15aPKYI zTS=$f{LbU+tTKqpy+mTVWF4>|K5R>JmKEd@BZ$9*P;DAMY}Lqm!gSTE^at~6UTH2e z%jDif{CHj!uP+y^5HGr8vQud=Q8nDjx&}5iP41CGy4u>-_ajID20Q<_RTS4GE&No- zE((O?cT-5?RdjXb>`L#29)|GB`a&|s#pO|ltVE&}W(v+G74Df7UTHC(qGB=LUdr@CDi%Nazj2$sSzKExxa0J(gf|v(6czPK(uyPWwU7Ms8f^oe?1V?zrrPga1FE!9sBs3I}G~+Q%j|^p?h+P*0Pl-@tr9y5<$mWkT4cU<8`}k*56%1v@?_Ood1Hyv zwa6|jG+z}?VKcgVp^WNjy|Ze)R+m-j7owxTyOL7am}76Nu4dX$7}1T7oRnw@A*|JZn8)pgHw$f+hU4*j~5!wUKVaEN#9?p#}x->S%=%Zed3$+Mhc{_*AA+meCB;uc3Ekx0oRINmMSXGpFy>+-s()H=bN%i zg5Hlt#s@+rtVBwluM^rVa`_Sfeng=1Jap30c&&{W<+deLzI;`WYxN`Vn;`R#Umh;B zecck9*GB>^yRqdb$LC#p=;YIJEmGWwkn&?@qud?R=Lt7BIXG0$P$wb7trB@PXUC08 zj;Bri%C2Oa%Ln~2v9nDg$P$lfn)P6UMX}#W-t%2kJAN&N8*To3pEegS^;3G&?$#f5 zKOS#*JRA9!$j2cu+SZA;ZR}`!Iw6}aXo;1)Zah#quMt$@BCzc;O5L?nA&g`>00zg zdz%As=^Y;mqie;qPc|N9_)c_)O=s*aPJB)xJoVC-4c%(ya5D_@-pCmadxiID$!a#E znV-~wr(Vy5%_dQH*YQuV`#GGrB38nZ$4i7 zF!_Koy+9DBK_XLkiue_M>!M!1Ovqg3%})zHviaE| zeYGyKi|S!MBR|H)PAHuSUWYzy9=}pZej-T4usz}Fmai(|K}#cWjXIC|Z|)QWGI!Q- z-}$=npxn(P??9yA_>+m;Gp~boog$+ePvrPcGjV3^MnYEqP0RMh4$Aq@z8hVO9rXwM z$B*fZX^BWP8)>KA3x6N` z2yG8ejZ=LV-gh1CHS@_FUE&fX3g-~MdX~~`)OT{?z7??7_xSjr2JQ~x`};mr1V@iI zCy$y3ZM^hl7ReiEH>SEaCJcSgCd7Jiown+o`a}0*LU)k9KUvn7d>jw@Pu~;awPRN0 zHx{<~LuY;u?;*D+*QSpCY*@3K6o@-qQ{zk9({q;;3`~lSrfjZpnjuNQvwi)Q;BngE z-nlbzt{PA2m;IB`{SofAe<=?L2DcoygqWFSPicP|ChSms(y>1W{}g4_WXc|x2L3;D CbBSjF literal 0 HcmV?d00001 diff --git a/docs/features/disagg_prefill.md b/docs/features/disagg_prefill.md index c0c32594f2..996ef00a6b 100644 --- a/docs/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -19,6 +19,18 @@ Two main reasons: Please refer to for the example usage of disaggregated prefilling. +Now supports 5 types of connectors: + +- **SharedStorageConnector**: refer to for the example usage of SharedStorageConnector disaggregated prefilling. +- **LMCacheConnectorV1**: refer to for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission. +- **NixlConnector**: refer to for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. +- **P2pNcclConnector**: refer to for the example usage of P2pNcclConnector disaggregated prefilling. 
+- **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as: + + ```bash + --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}' + ``` + ## Benchmarks Please refer to for disaggregated prefilling benchmarks. @@ -48,6 +60,19 @@ The workflow of disaggregated prefilling is as follows: The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. +Now every process in vLLM will have a corresponding connector. Specifically, we have: + +- Scheduler connector: the connector that locates in the same process as the scheduler process. It schedules the KV cache transfer ops. +- Worker connectors: the connectors that locate in the worker processes. They execute KV cache transfer ops. + +Here is a figure illustrating how the above 2 connectors are organized: + +![Disaggregated prefilling high level design](../assets/features/disagg_prefill/high_level_design.png) + +The figure below shows how the worker connector works with the attention module to achieve layer-by-layer KV cache store and load: + +![Disaggregated prefilling workflow](../assets/features/disagg_prefill/workflow.png) + ## Third-party contributions Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors). From 766bc8162cb37ad32605eee051d4f049ec325926 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 7 Aug 2025 16:45:04 +0800 Subject: [PATCH 064/932] [Core] Store only the keys for multi-modal data in P0 (#22198) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 30 +++--- docs/configuration/optimization.md | 83 +++++++--------- examples/offline_inference/mistral-small.py | 2 +- examples/offline_inference/vision_language.py | 2 +- tests/models/utils.py | 5 +- tests/multimodal/test_cache.py | 51 ++++++++++ tests/multimodal/test_processing.py | 48 +-------- vllm/config.py | 30 +++++- vllm/engine/arg_utils.py | 22 ++--- vllm/entrypoints/cli/serve.py | 5 +- vllm/envs.py | 6 +- vllm/multimodal/cache.py | 95 ++++++++++++++++++ vllm/multimodal/processing.py | 53 +--------- vllm/v1/core/kv_cache_utils.py | 4 +- vllm/v1/engine/core.py | 7 +- vllm/v1/engine/mm_input_cache.py | 99 ++++++++++++------- vllm/v1/engine/processor.py | 17 ++-- 17 files changed, 325 insertions(+), 234 deletions(-) create mode 100644 tests/multimodal/test_cache.py create mode 100644 vllm/multimodal/cache.py diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 4d5c961af9..dcaf1069bf 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). 
+- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process) - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits @@ -129,20 +129,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory. Here are some examples: -??? code +```python +from vllm import LLM - ```python - from vllm import LLM +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) - # Available for Qwen2-VL series models - llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - - # Available for InternVL series models - llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) - ``` +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 811925c19e..bb7342c93f 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -2,6 +2,9 @@ This guide covers optimization strategies and performance tuning for vLLM V1. +!!! tip + Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory. + ## Preemption Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. @@ -126,62 +129,44 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. -## Reducing Memory Usage +## Input Processing -If you encounter out-of-memory issues, consider these strategies: +### Parallel Processing -### Context Length and Batch Size +You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing). +This is useful when input processing (which is run inside the API server) +becomes a bottleneck compared to model execution (which is run inside engine core) +and you have excess CPU capacity. -You can reduce memory usage by limiting the context length and batch size: +```console +# Run 4 API processes and 1 engine core process +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -```python -from vllm import LLM - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - max_model_len=2048, # Limit context window - max_num_seqs=4 # Limit batch size -) +# Run 4 API processes and 2 engine core processes +vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 ``` -### Adjust CUDA Graph Compilation +!!! note + API server scale-out is only available for online inference. -CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: +!!! note + [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled + because it requires a one-to-one correspondance between API and engine core processes. 
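+For example, to give each API process and each engine core process an 8 GiB multi-modal processor cache budget (8 GiB is only an illustrative value, not a recommended default; see the next section for details on this cache), set the environment variable before launching the server:
+
+```console
+# Raise the per-process multi-modal processor cache budget from the 4 GiB default to 8 GiB
+export VLLM_MM_INPUT_CACHE_GIB=8
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct
+```
+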
+ +## Multi-Modal Caching + +### Processor Cache + +By default, the multi-modal processor cache is enabled to avoid repeatedly processing +the same multi-modal inputs via Hugging Face `AutoProcessor`, +which commonly occurs in multi-turn conversations. + +You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable +(default 4 GiB per API process + 4 GiB per engine core process). + +If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: ```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - cudagraph_capture_sizes=[1, 2, 4, 8] # Capture fewer batch sizes - ) -) -``` - -Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: - -```python -from vllm import LLM - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True # Disable CUDA graph compilation -) -``` - -### Multimodal Models - -For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: - -```python -from vllm import LLM - -# Accept up to 2 images per prompt -llm = LLM( - model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 2} -) +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + disable_mm_preprocessor_cache=True) ``` diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index a38fc9216d..59ec22a1e9 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -166,7 +166,7 @@ def parse_args(): parser.add_argument( "--disable-mm-preprocessor-cache", action="store_true", - help="If True, disables caching of multi-modal preprocessor/mapper.", + help="If True, disables caching of multi-modal processor.", ) return parser.parse_args() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 16bb3712f5..5dbe001994 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1565,7 +1565,7 @@ def parse_args(): parser.add_argument( "--disable-mm-preprocessor-cache", action="store_true", - help="If True, disables caching of multi-modal preprocessor/mapper.", + help="If True, disables caching of multi-modal processor.", ) parser.add_argument( diff --git a/tests/models/utils.py b/tests/models/utils.py index 4657df60b1..27ce9de469 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -9,7 +9,7 @@ import torch import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config import ModelConfig, RunnerOption +from vllm.config import ModelConfig, ModelDType, RunnerOption from vllm.inputs import InputContext from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs @@ -257,7 +257,7 @@ def check_logprobs_close( def build_model_context( model_id: str, runner: RunnerOption = "auto", - dtype: Union[str, torch.dtype] = "auto", + dtype: ModelDType = "auto", model_config_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None, @@ -279,6 +279,7 @@ def build_model_context( model_info.check_transformers_version(on_fail="skip") model_config_kwargs = model_config_kwargs or {} + limit_mm_per_prompt = 
limit_mm_per_prompt or {} model_config = ModelConfig( model_id, runner=runner, diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py new file mode 100644 index 0000000000..e07b73bd25 --- /dev/null +++ b/tests/multimodal/test_cache.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, + MultiModalKwargsItem, + MultiModalSharedField) + + +def _dummy_elem(modality: str, key: str, size: int): + return MultiModalFieldElem( + modality=modality, + key=key, + data=torch.empty((size, ), dtype=torch.int8), + field=MultiModalSharedField(1), + ) + + +def _dummy_item(modality: str, size_by_key: dict[str, int]): + return MultiModalKwargsItem.from_elems([ + _dummy_elem(modality, key, size) for key, size in size_by_key.items() + ]) + + +def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargs.from_items([ + _dummy_item(modality, size_by_key) + for modality, size_by_key in size_by_key_modality.items() + ]) + + +# yapf: disable +@pytest.mark.parametrize( + ("item", "expected_size"), + [ + (_dummy_item("a", {"a1": 100}), 100), + (_dummy_item("a", {"a1": 100, "a2": 110}), 210), + (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + ], +) +# yapf: enable +def test_cache_item_size(item, expected_size): + cache = MultiModalCache.get_lru_cache(2048, type(item)) + + cache[""] = item + assert cache.currsize == expected_size + + cache[""] = MultiModalCacheItemMetadata.wraps(item) + assert cache.currsize == expected_size diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 508c773b8a..cb489c47fd 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -6,20 +6,15 @@ from typing import Optional, cast import numpy as np import pytest -import torch from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, - MultiModalSharedField) # yapf conflicts with isort for this block # yapf: disable from vllm.multimodal.processing import (PlaceholderFeaturesInfo, - ProcessingCache, PromptIndexTargets, - PromptInsertion, PromptReplacement, - apply_text_matches, + PromptIndexTargets, PromptInsertion, + PromptReplacement, apply_text_matches, apply_token_matches, find_mm_placeholders, find_text_matches, find_token_matches, @@ -902,45 +897,6 @@ def test_find_mm_placeholders( assert result == expected -def _dummy_elem(modality: str, key: str, size: int): - return MultiModalFieldElem( - modality=modality, - key=key, - data=torch.empty((size, ), dtype=torch.int8), - field=MultiModalSharedField(1), - ) - - -def _dummy_item(modality: str, size_by_key: dict[str, int]): - return MultiModalKwargsItem.from_elems([ - _dummy_elem(modality, key, size) for key, size in size_by_key.items() - ]) - - -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ - _dummy_item(modality, size_by_key) - for modality, size_by_key in size_by_key_modality.items() - ]) - - -# yapf: disable -@pytest.mark.parametrize( - ("item", "expected_size"), - [ - (_dummy_item("a", {"a1": 100}), 100), - (_dummy_item("a", {"a1": 100, 
"a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 - ], -) -# yapf: enable -def test_cache_item_size(item, expected_size): - cache = ProcessingCache.get_lru_cache(2048, type(item)) - cache[""] = item - - assert cache.currsize == expected_size - - @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), diff --git a/vllm/config.py b/vllm/config.py index 899862bf54..44a8d871f0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -444,8 +444,7 @@ class ModelConfig: model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ disable_mm_preprocessor_cache: bool = False - """If `True`, disable caching of the multi-modal preprocessor/mapper (not - recommended).""" + """If `True`, disable caching of the multi-modal processor.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -1692,6 +1691,31 @@ class ModelConfig: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None + @property + def processor_return_mm_hashes(self) -> bool: + """Whether the multi-modal processor should output hashes.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return not mm_config.disable_mm_preprocessor_cache + + @property + def enable_mm_input_cache(self) -> bool: + """Whether the multi-modal input cache should be enabled.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return not mm_config.disable_mm_preprocessor_cache + + def get_mm_input_cache_gb(self) -> int: + mm_config = self.multimodal_config + if mm_config is None: + return 0 + + return envs.VLLM_MM_INPUT_CACHE_GIB + @property def is_cross_encoder(self) -> bool: return (self._model_info.supports_cross_encoding @@ -3369,7 +3393,7 @@ class MultiModalConfig: disable_mm_preprocessor_cache: bool = False """ - If `True`, disable caching of the processed multi-modal inputs. + If `True`, disable caching of the multi-modal processor. 
""" interleave_mm_strings: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3e2f03d56c..a18cd9dde3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1230,17 +1230,17 @@ class EngineArgs: enable_multimodal_encoder_data_parallel, ) - supports_mm_preprocessor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not supports_mm_preprocessor_cache - and model_config.is_multimodal_model - and not model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with data parallelism when there does not exist a " - "one-to-one correspondance between API process and " - "EngineCore process, so the cache will be disabled.") - model_config.set_disable_mm_preprocessor_cache(True) + if model_config.is_multimodal_model: + dp_supports_mm_processor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not dp_supports_mm_processor_cache + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal processor cache is disabled because " + "it is not compatible with data parallelism when " + "there does not exist a one-to-one correspondance " + "between API and engine core processes.") + model_config.set_disable_mm_preprocessor_cache(True) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 9762a1de9e..02b78f103c 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -163,9 +163,8 @@ def run_multi_api_server(args: argparse.Namespace): if model_config.is_multimodal_model and not ( orig_disable_mm_preprocessor_cache): - logger.warning( - "Multi-modal preprocessor cache is not compatible " - "with api_server_count > 1, so the cache will be disabled.") + logger.warning("Multi-modal processor cache is disabled because " + "it is not compatible with `api_server_count > 1`.") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats diff --git a/vllm/envs.py b/vllm/envs.py index f6c6d7e7ed..212eaf015a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -65,7 +65,7 @@ if TYPE_CHECKING: VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" - VLLM_MM_INPUT_CACHE_GIB: int = 8 + VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -561,8 +561,8 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - # Cache size (in GiB) for multimodal input cache - # Default is 4 GiB + # Cache size (in GiB per process) for multimodal input cache + # Default is 4 GiB per API process + 4 GiB per engine core process "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py new file mode 100644 index 0000000000..262b22e554 --- /dev/null +++ b/vllm/multimodal/cache.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import sys +from collections.abc import Mapping +from dataclasses import dataclass +from typing import TypeVar, Union + +import torch + +from vllm.jsontree import json_map_leaves, json_reduce_leaves +from vllm.logger import init_logger +from vllm.utils import GiB_bytes, 
LRUCache + +from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors + +logger = init_logger(__name__) + + +@dataclass +class MultiModalCacheItemMetadata: + size: int + + @classmethod + def wraps(cls, value: "MultiModalCacheValue"): + return cls(size=MultiModalCache.get_item_size(value)) + + +MultiModalCacheValue = Union[ + MultiModalKwargs, + MultiModalKwargsItem, + Mapping[str, NestedTensors], + MultiModalCacheItemMetadata, +] + +_V = TypeVar("_V", bound=MultiModalCacheValue) + + +class MultiModalCache: + + @classmethod + def get_leaf_size( + cls, + leaf: object, + *, + debug: bool = False, + ) -> int: + # MultiModalKwargs is not a subclass of dict + if isinstance(leaf, MultiModalKwargs): + return cls.get_item_size(leaf.data, debug=debug) + + # MultiModalKwargsItem is not a subclass of dict + if isinstance(leaf, MultiModalKwargsItem): + leaf_data = {k: v.data for k, v in leaf.items()} + return cls.get_item_size(leaf_data, debug=debug) + + # sys.getsizeof doesn't work for tensors + if isinstance(leaf, torch.Tensor): + return leaf.nbytes + + if isinstance(leaf, MultiModalCacheItemMetadata): + return leaf.size + + return sys.getsizeof(leaf) + + @classmethod + def get_item_size( + cls, + value: MultiModalCacheValue, + *, + debug: bool = False, + ) -> int: + size = json_reduce_leaves( + lambda a, b: a + b, + json_map_leaves(lambda x: cls.get_leaf_size(x, debug=debug), + value), + ) + + if debug: + logger.debug("Calculated size of %s to be %.2f GiB", type(value), + size / GiB_bytes) + + return size + + @classmethod + def get_lru_cache( + cls, + capacity_gb: float, + value_type: type[_V], + *, + debug: bool = False, + ) -> LRUCache[str, _V]: + return LRUCache( + GiB_bytes * capacity_gb, + getsizeof=lambda x: cls.get_item_size(x, debug=debug), + ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 46240855d1..0378539495 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import sys from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, @@ -16,16 +15,16 @@ import torch from typing_extensions import assert_never from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves, json_reduce_leaves from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import GiB_bytes, LRUCache, flatten_2d_lists, full_groupby +from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby +from .cache import MultiModalCache from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargs, - MultiModalKwargsItem, NestedTensors, PlaceholderRange) + MultiModalKwargsItem, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -888,9 +887,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -_V = TypeVar("_V", bound="Union[MultiModalKwargs, MultiModalKwargsItem]") - - class ProcessingCacheOptionalItem(NamedTuple): key: str value: Optional[MultiModalKwargsItem] @@ -901,48 +897,7 @@ class ProcessingCacheItem(NamedTuple): value: MultiModalKwargsItem -class ProcessingCache: - - @staticmethod - def get_lru_cache( - capacity_gb: float, - value_type: 
type[_V], - *, - debug: bool = False, - ) -> LRUCache[str, _V]: - - def get_leaf_size(leaf: object) -> int: - # MultiModalKwargs is not a subclass of dict - if isinstance(leaf, MultiModalKwargs): - return get_item_size(leaf.data) - - # MultiModalKwargsItem is not a subclass of dict - if isinstance(leaf, MultiModalKwargsItem): - leaf_data = {k: v.data for k, v in leaf.items()} - return get_item_size(leaf_data) - - # sys.getsizeof doesn't work for tensors - if isinstance(leaf, torch.Tensor): - return leaf.nbytes - - return sys.getsizeof(leaf) - - def get_item_size( - value: Union[MultiModalKwargs, MultiModalKwargsItem, - Mapping[str, NestedTensors]] - ) -> int: - size = json_reduce_leaves( - lambda a, b: a + b, - json_map_leaves(get_leaf_size, value), - ) - - if debug: - logger.debug("Calculated size of %s to be %.2f GiB", - type(value), size / GiB_bytes) - - return size - - return LRUCache(GiB_bytes * capacity_gb, getsizeof=get_item_size) +class ProcessingCache(MultiModalCache): def __init__( self, diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index eab1560b1a..38b1d9b13f 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -429,8 +429,8 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, if mm_positions and len(mm_positions) != len(mm_hashes): raise ValueError( "The number of multi-modal positions and hashes must match. This " - "is likely because you do not enable MM preprocessor hashing. " - "Please set disable_mm_preprocessor_cache=False.") + "is likely because you did not enable MM hashing. " + "Please set `disable_mm_preprocessor_cache=False`.") # Note that we assume mm_positions is sorted by offset. # We do not need to check all mm inputs if the start token index is out of diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 79c47e1028..78b8fe4ea6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -35,7 +35,7 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.mm_input_cache import MirroredProcessingCache +from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig @@ -124,8 +124,7 @@ class EngineCore: log_stats=self.log_stats, ) - # Setup MM Input Mapper. - self.mm_input_cache_server = MirroredProcessingCache( + self.mm_input_cache_server = MultiModalInputCacheServer( vllm_config.model_config) # Setup batch queue for pipeline parallelism. @@ -413,7 +412,7 @@ class EngineCore: # Note on thread safety: no race condition. # `mm_input_cache_server` is reset at the end of LLMEngine init, # and will only accessed in the input processing thread afterwards. 
- request.mm_inputs = self.mm_input_cache_server.get_and_update_p1( + request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) req = Request.from_engine_core_request(request) diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index abe98a13df..279c9f0007 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,54 +1,68 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional +from typing import TYPE_CHECKING, Optional -from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.processing import ProcessingCache +from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata from vllm.utils import is_list_of -# The idea of multimodal preprocessing caching is based on having a client and +if TYPE_CHECKING: + from vllm.config import ModelConfig + +# The idea of multimodal input caching is based on having a client and # a server, where the client executes in the frontend process (=P0) and the # server in the core process (=P1). # -# -- Client: -# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs -# with built-in caching functionality, with mm_hash as its identifier. -# - MirroredProcessingCache to keep track of the cached entries and -# determine whether to send the MultiModalKwargs to P1. +# -- P0: +# - BaseMultiModalProcessor calls MultiModalHasher to get the `mm_hash` of +# each input multi-modal item (e.g. image), +# - BaseMultiModalProcessor processes the input items into `mm_inputs`, +# which are MultiModalKwargsItem instances that each correspond to an +# input multi-modal item. +# - MultiModalInputCacheClient accepts the `mm_inputs` and corresponding +# `mm_hash` for each item. It stores the `mm_hash` as keys and the size +# of `mm_inputs`, but not the `mm_inputs` themselves, to avoid taking +# up additional memory in P0. +# - The `mm_hash` is always sent to P1. +# - The corresponding `mm_inputs` are only sent to P1 if they are not cached +# in MultiModalInputCacheServer. # -# -- Server: -# - MirroredProcessingCache to store the MultiModalKwargs from P0. +# -- P1: +# - If the `mm_hash` is cached (i.e. `mm_inputs` are not sent from P0), +# MultiModalInputCacheServer retrieves the corresponding `mm_inputs`. +# - If the `mm_hash` is not cached (i.e. `mm_inputs` are sent from P0), +# MultiModalInputCacheServer stores `mm_inputs` under the key `mm_hash`. +# - Either way, the `mm_hash` and corresponding `mm_inputs` are sent to +# the engine for model execution. # -# The caching for both client and server is mirrored, and this allows us -# to avoid the serialization of "mm_inputs" (like pixel values) between -# client (=P0) and server (=P1) processes if the mm_hash is found in the client -# cache. - -# Both Client and Server must use the same cache size -# (to perform mirrored caching). This cache size is set by the environment -# variable VLLM_MM_INPUT_CACHE_GIB. +# Both Client and Server must perform cache update and eviction based on the +# same item size. This ensures that the keys of MultiModalInputCacheClient +# and MultiModalInputCacheServer are mirrored, allowing us to determine in P0 +# whether a key is cached in MultiModalInputCacheServer by querying +# MultiModalInputCacheClient without having to communicate with P1. 
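# Illustrative walk-through (a sketch; kw_A/kw_B and hash_A/hash_B are
# made-up names, not part of this module). Suppose a request carries two
# images and only the first one has been seen before:
#   P0: client.get_and_update([kw_A, kw_B], ["hash_A", "hash_B"])
#       -> [None, kw_B]   # kw_A is not re-sent; only its hash travels to P1
#   P1: server.get_and_update([None, kw_B], ["hash_A", "hash_B"])
#       -> [kw_A, kw_B]   # kw_A is restored from the server-side cache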
-class MirroredProcessingCache: +class MultiModalInputCacheClient: + """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - def __init__(self, model_config): - mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = ( - mm_config is not None and mm_config.disable_mm_preprocessor_cache) - self.use_cache = not disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, - MultiModalKwargs) + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() - def get_and_update_p0( + self.enabled = model_config.enable_mm_input_cache + self.mm_cache = MultiModalCache.get_lru_cache( + model_config.get_mm_input_cache_gb(), + MultiModalCacheItemMetadata, + ) + + def get_and_update( self, mm_inputs: Sequence[MultiModalKwargs], mm_hashes: list[str], ) -> Sequence[Optional[MultiModalKwargs]]: assert len(mm_inputs) == len(mm_hashes) - if not self.use_cache: + if not self.enabled: assert is_list_of(mm_inputs, MultiModalKwargs) return mm_inputs @@ -57,20 +71,37 @@ class MirroredProcessingCache: if self.mm_cache.get(mm_hash) is not None: mm_input = None else: - self.mm_cache[mm_hash] = mm_input + self.mm_cache[mm_hash] = \ + MultiModalCacheItemMetadata.wraps(mm_input) full_mm_inputs.append(mm_input) return full_mm_inputs - def get_and_update_p1( + def reset(self) -> None: + self.mm_cache.clear() + + +class MultiModalInputCacheServer: + """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + self.enabled = model_config.enable_mm_input_cache + self.mm_cache = MultiModalCache.get_lru_cache( + model_config.get_mm_input_cache_gb(), + MultiModalKwargs, + ) + + def get_and_update( self, mm_inputs: Sequence[Optional[MultiModalKwargs]], mm_hashes: list[str], ) -> Sequence[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) - if not self.use_cache: + if not self.enabled: assert is_list_of(mm_inputs, MultiModalKwargs) return mm_inputs @@ -85,7 +116,5 @@ class MirroredProcessingCache: return full_mm_inputs - def reset(self) -> bool: + def reset(self) -> None: self.mm_cache.clear() - - return True diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 692a7dd564..6e37ebeb87 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -19,7 +19,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MirroredProcessingCache +from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) from vllm.v1.structured_output.backend_outlines import ( @@ -50,11 +50,8 @@ class Processor: self.tokenizer, mm_registry) - self.mm_input_cache_client = MirroredProcessingCache(self.model_config) - - # Multi-modal hasher (for images) - self.use_hash = self.mm_input_cache_client.use_cache or \ - self.cache_config.enable_prefix_caching + self.mm_input_cache_client = MultiModalInputCacheClient( + self.model_config) @property def mm_registry(self): @@ -256,11 +253,13 @@ class Processor: # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. 
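# Multi-modal hashes are returned both to key the mirrored P0/P1 input
# cache and, when prefix caching is enabled, to compute block hashes for
# multi-modal tokens (see _gen_mm_extra_hash_keys above).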
+ return_mm_hashes = (self.model_config.processor_return_mm_hashes + or bool(self.cache_config.enable_prefix_caching)) processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - return_mm_hashes=self.use_hash, + return_mm_hashes=return_mm_hashes, ) from vllm.platforms import current_platform current_platform.validate_request( @@ -312,7 +311,7 @@ class Processor: sorted_mm_hashes, ) = merge_and_sort_multimodal_metadata( decoder_inputs["mm_placeholders"], - decoder_inputs["mm_hashes"] if self.use_hash else None, + decoder_inputs["mm_hashes"] if return_mm_hashes else None, ) # The output of merged multi-modal processor (`decoder_mm_inputs`) @@ -339,7 +338,7 @@ class Processor: ] if sorted_mm_hashes is not None: - sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0( + sorted_mm_inputs = self.mm_input_cache_client.get_and_update( orig_sorted_mm_inputs, sorted_mm_hashes) else: sorted_mm_inputs = orig_sorted_mm_inputs From 7e0b121812a30975365497e39608898312c18984 Mon Sep 17 00:00:00 2001 From: fxmarty-amd Date: Thu, 7 Aug 2025 15:30:48 +0200 Subject: [PATCH 065/932] [Bugfix] Add missing `packed_modules_mapping` to `DeepseekV2ForCausalLM` (#22352) Signed-off-by: Felix Marty --- vllm/model_executor/models/deepseek_v2.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 68a0a83d62..c2880c33cb 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -726,6 +726,9 @@ class DeepseekV2Model(nn.Module): class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -733,6 +736,19 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config + + # `packed_modules_mapping` needs to be modified before + # initializing DeepseekV2Model, as it is passed inplace to + # quantization config init and may be used to select the + # quant_method for relevant layers during initialization. 
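# For illustration (assuming q_lora_rank is set on the config), the
# resulting mapping contains both entries:
#   {"gate_up_proj": ["gate_proj", "up_proj"],
#    "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]}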
+ self.fuse_qkv_a_proj = hasattr( + config, "q_lora_rank") and config.q_lora_rank is not None + if self.fuse_qkv_a_proj: + self.packed_modules_mapping["fused_qkv_a_proj"] = [ + "q_a_proj", + "kv_a_proj_with_mqa", + ] + self.model = DeepseekV2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: From 4da8bf20d08f1f8f97a4839d580eb923d0ca9415 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 7 Aug 2025 07:03:38 -0700 Subject: [PATCH 066/932] [Tool] Fix auto tool call (#22434) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/serving_responses.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f26f92537c..21fc209af9 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -132,9 +132,6 @@ class OpenAIServingResponses(OpenAIServing): "\"auto\" tool choice has been enabled please note that while" " the parallel_tool_calls client option is preset for " "compatibility reasons, it will be ignored.") - if not self.use_harmony: - raise NotImplementedError("Auto tool choice is not supported " - "yet unless using Harmony") # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we @@ -212,8 +209,8 @@ class OpenAIServingResponses(OpenAIServing): await self._make_request(request, prev_response, tokenizer)) - except (ValueError, TypeError, RuntimeError, - jinja2.TemplateError) as e: + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError, + NotImplementedError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") @@ -313,6 +310,9 @@ class OpenAIServingResponses(OpenAIServing): prev_response: Optional[ResponsesResponse], tokenizer: AnyTokenizer, ): + if len(request.tools) > 0: + raise NotImplementedError( + "Tool use is not supported in Responses API without Harmony") # Construct the input messages. messages = self._construct_input_messages(request, prev_response) _, request_prompts, engine_prompts = await self._preprocess_chat( From 4815b00f5487a070a40c7451c2cfcaef80786220 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 7 Aug 2025 08:33:25 -0700 Subject: [PATCH 067/932] [gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410) Signed-off-by: Chen Zhang --- .../openai/responses/test_basic.py | 2 +- vllm/entrypoints/harmony_utils.py | 153 ++++++++++++++- vllm/entrypoints/openai/protocol.py | 31 +-- vllm/entrypoints/openai/serving_responses.py | 184 ++++++++++++------ 4 files changed, 290 insertions(+), 80 deletions(-) diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/responses/test_basic.py index 974ea8673c..18c35152e7 100644 --- a/tests/v1/entrypoints/openai/responses/test_basic.py +++ b/tests/v1/entrypoints/openai/responses/test_basic.py @@ -17,7 +17,7 @@ async def test_simple_input(client: openai.AsyncOpenAI): # Whether the output contains the reasoning. 
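# With Harmony-based output items, the reasoning text is nested under
# `content[0].text` instead of the previous flat `.text` field.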
assert outputs[0].type == "reasoning" - assert outputs[0].text != "" + assert outputs[0].content[0].text != "" @pytest.mark.asyncio diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index ee08d62b57..87e76e08a0 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -1,18 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime +import json from collections.abc import Iterable, Sequence from typing import Literal, Optional, Union -from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem +from openai.types.responses import (ResponseFunctionToolCall, + ResponseOutputItem, ResponseOutputMessage, + ResponseOutputText, ResponseReasoningItem) +from openai.types.responses.response_function_web_search import ( + ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) from openai.types.responses.tool import Tool from openai_harmony import (Author, Conversation, DeveloperContent, HarmonyEncodingName, Message, ReasoningEffort, Role, StreamableParser, SystemContent, TextContent, ToolDescription, load_harmony_encoding) -from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem, - ResponseReasoningItem) +from vllm.entrypoints.openai.protocol import ResponseInputOutputItem +from vllm.utils import random_uuid REASONING_EFFORT = { "high": ReasoningEffort.HIGH, @@ -160,6 +167,146 @@ def render_for_completion(messages: list[Message]) -> list[int]: return token_ids +def parse_output_message(message: Message) -> list[ResponseOutputItem]: + """ + Parse a Harmony message into a list of output response items. + """ + if message.author.role != "assistant": + # This is a message from a tool to the assistant (e.g., search result). + # Don't include it in the final output for now. This aligns with + # OpenAI's behavior on models like o4-mini. + return [] + + output_items: list[ResponseOutputItem] = [] + recipient = message.recipient + if recipient is not None and recipient.startswith("browser."): + if len(message.content) != 1: + raise ValueError("Invalid number of contents in browser message") + content = message.content[0] + browser_call = json.loads(content.text) + # TODO: translate to url properly! 
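# For now the query/url is emitted as "cursor:<value>" rather than a real
# URL (see the TODO above).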
+ if recipient == "browser.search": + action = ActionSearch( + query=f"cursor:{browser_call.get('query', '')}", type="search") + elif recipient == "browser.open": + action = ActionOpenPage( + url=f"cursor:{browser_call.get('url', '')}", type="open_page") + elif recipient == "browser.find": + action = ActionFind(pattern=browser_call["pattern"], + url=f"cursor:{browser_call.get('url', '')}", + type="find") + else: + raise ValueError(f"Unknown browser action: {recipient}") + web_search_item = ResponseFunctionWebSearch( + id=f"ws_{random_uuid()}", + action=action, + status="completed", + type="web_search_call", + ) + output_items.append(web_search_item) + elif message.channel == "analysis": + for content in message.content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=content.text, + type="reasoning_text") + ], + status=None, + ) + output_items.append(reasoning_item) + elif message.channel == "commentary": + if message.recipient.startswith("functions."): + function_name = message.recipient.split(".")[-1] + for content in message.content: + random_id = random_uuid() + response_item = ResponseFunctionToolCall( + arguments=content.text, + call_id=f"call_{random_id}", + type="function_call", + name=function_name, + id=f"ft_{random_id}", + ) + output_items.append(response_item) + elif message.recipient.startswith( + "python") or message.recipient.startswith("browser"): + for content in message.content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + text=content.text, + status=None, + ) + output_items.append(reasoning_item) + else: + raise ValueError(f"Unknown recipient: {message.recipient}") + elif message.channel == "final": + contents = [] + for content in message.content: + output_text = ResponseOutputText( + text=content.text, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + contents.append(output_text) + text_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=contents, + role=message.author.role, + status="completed", + type="message", + ) + output_items.append(text_item) + else: + raise ValueError(f"Unknown channel: {message.channel}") + return output_items + + +def parse_remaining_state( + parser: StreamableParser) -> list[ResponseOutputItem]: + if not parser.current_content: + return [] + if parser.current_role != Role.ASSISTANT: + return [] + current_recipient = parser.current_recipient + if (current_recipient is not None + and current_recipient.startswith("browser.")): + return [] + + if parser.current_channel == "analysis": + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=parser.current_content, + type="reasoning_text") + ], + status=None, + ) + return [reasoning_item] + elif parser.current_channel == "final": + output_text = ResponseOutputText( + text=parser.current_content, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + text_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="completed", + type="message", + ) + return [text_item] + return [] + + def get_stop_tokens_for_assistant_actions() -> list[int]: return get_encoding().stop_tokens_for_assistant_actions() diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ea2cf57563..3b9f4b544e 100644 --- 
a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import ( # yapf: enable from openai.types.responses import (ResponseFunctionToolCall, ResponseInputItemParam, ResponseOutputItem, - ResponseOutputMessage, ResponsePrompt, - ResponseStatus, ResponseTextConfig) + ResponsePrompt, ResponseStatus, + ResponseTextConfig) from openai.types.responses.response import ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning @@ -1729,13 +1729,20 @@ class TranscriptionStreamResponse(OpenAIBaseModel): usage: Optional[UsageInfo] = Field(default=None) -class ResponseReasoningItem(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"rs_{random_uuid()}") - text: str - summary: list = Field(default_factory=list) - type: Literal["reasoning"] = "reasoning" - encrypted_content: Optional[str] = None - status: Optional[Literal["in_progress", "completed", "incomplete"]] +class InputTokensDetails(OpenAIBaseModel): + cached_tokens: int + + +class OutputTokensDetails(OpenAIBaseModel): + reasoning_tokens: int + + +class ResponseUsage(OpenAIBaseModel): + input_tokens: int + input_tokens_details: InputTokensDetails + output_tokens: int + output_tokens_details: OutputTokensDetails + total_tokens: int class ResponsesResponse(OpenAIBaseModel): @@ -1747,7 +1754,7 @@ class ResponsesResponse(OpenAIBaseModel): metadata: Optional[Metadata] = None model: str object: Literal["response"] = "response" - output: list[Union[ResponseOutputMessage, ResponseReasoningItem]] + output: list[ResponseOutputItem] parallel_tool_calls: bool temperature: float tool_choice: ToolChoice @@ -1764,7 +1771,7 @@ class ResponsesResponse(OpenAIBaseModel): text: Optional[ResponseTextConfig] = None top_logprobs: int truncation: Literal["auto", "disabled"] - usage: Optional[UsageInfo] = None + usage: Optional[ResponseUsage] = None user: Optional[str] = None @classmethod @@ -1776,7 +1783,7 @@ class ResponsesResponse(OpenAIBaseModel): created_time: int, output: list[ResponseOutputItem], status: ResponseStatus, - usage: Optional[UsageInfo] = None, + usage: Optional[ResponseUsage] = None, ) -> "ResponsesResponse": return cls( id=request.request_id, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 21fc209af9..d40231795b 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -6,12 +6,15 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from copy import copy from http import HTTPStatus -from typing import Callable, Final, Optional, Union +from typing import Any, Callable, Final, Optional, Union import jinja2 from fastapi import Request from openai.types.responses import (ResponseFunctionToolCall, - ResponseOutputMessage, ResponseOutputText) + ResponseOutputItem, ResponseOutputMessage, + ResponseOutputText, ResponseReasoningItem) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs @@ -19,26 +22,28 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, ChatTemplateContentFormatOption) -from vllm.entrypoints.context import ConversationContext, SimpleContext +from vllm.entrypoints.context import (ConversationContext, HarmonyContext, + SimpleContext, 
StreamingHarmonyContext) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, - get_system_message, get_user_message, parse_response_input, - render_for_completion) + get_system_message, get_user_message, parse_output_message, + parse_remaining_state, parse_response_input, render_for_completion) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ErrorResponse, - PromptTokenUsageInfo, + InputTokensDetails, + OutputTokensDetails, RequestResponseMetadata, - ResponseReasoningItem, ResponsesRequest, - ResponsesResponse, UsageInfo) + ResponsesResponse, ResponseUsage) # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.tool_server import ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger +from vllm.outputs import CompletionOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -222,6 +227,7 @@ class OpenAIServingResponses(OpenAIServing): # Schedule the request and get the result generator. generators: list[AsyncGenerator[ConversationContext, None]] = [] try: + tool_sessions: dict[str, Any] = {} for i, engine_prompt in enumerate(engine_prompts): default_max_tokens = self.max_model_len - len( engine_prompt["prompt_token_ids"]) @@ -231,7 +237,15 @@ class OpenAIServingResponses(OpenAIServing): trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) - context = SimpleContext() + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, tool_sessions) + else: + context = HarmonyContext(messages, tool_sessions) + else: + context = SimpleContext() generator = self._generate_with_builtin_tools( request_id=request.request_id, request_prompt=request_prompts[i], @@ -274,6 +288,7 @@ class OpenAIServingResponses(OpenAIServing): request, sampling_params, result_generator, + context, model_name, tokenizer, request_metadata, @@ -297,6 +312,7 @@ class OpenAIServingResponses(OpenAIServing): request, sampling_params, result_generator, + context, model_name, tokenizer, request_metadata, @@ -344,6 +360,7 @@ class OpenAIServingResponses(OpenAIServing): request: ResponsesRequest, sampling_params: SamplingParams, result_generator: AsyncIterator[ConversationContext], + context: ConversationContext, model_name: str, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, @@ -352,9 +369,8 @@ class OpenAIServingResponses(OpenAIServing): if created_time is None: created_time = int(time.time()) - context: Optional[ConversationContext] = None try: - async for context in result_generator: + async for _ in result_generator: pass except asyncio.CancelledError: return self.create_error_response("Client disconnected") @@ -362,64 +378,40 @@ class OpenAIServingResponses(OpenAIServing): # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - assert context is not None - assert isinstance(context, SimpleContext) - final_res = context.last_output - assert final_res is not None - assert len(final_res.outputs) == 1 - final_output = final_res.outputs[0] - - if self.reasoning_parser: - try: - reasoning_parser = 
self.reasoning_parser(tokenizer) - except RuntimeError as e: - logger.exception("Error in reasoning parser creation.") - return self.create_error_response(str(e)) - - reasoning_content, content = ( - reasoning_parser.extract_reasoning_content(final_output.text, - request=request)) + if self.use_harmony: + assert isinstance(context, HarmonyContext) + output = self._make_response_output_items_with_harmony(context) + # TODO: these are all 0 for now! + num_prompt_tokens = context.num_prompt_tokens + num_generated_tokens = context.num_output_tokens + num_cached_tokens = context.num_cached_tokens + num_reasoning_tokens = context.num_reasoning_tokens else: - reasoning_content = None - content = final_output.text + assert isinstance(context, SimpleContext) + final_res = context.last_output + assert final_res is not None + assert len(final_res.outputs) == 1 + final_output = final_res.outputs[0] - output = [] - if reasoning_content: - reasoning_item = ResponseReasoningItem( - text=reasoning_content, - status=None, # NOTE: Only the last output item has status. - ) - output.append(reasoning_item) - if content: - output_text = ResponseOutputText( - text=content, - annotations=[], # TODO - type="output_text", - logprobs=None, # TODO - ) - message = ResponseOutputMessage( - id=f"msg_{random_uuid()}", - content=[output_text], - role="assistant", - status="completed", - type="message", - ) - output.append(message) + output = self._make_response_output_items(request, final_output, + tokenizer) - # Calculate usage. - assert final_res.prompt_token_ids is not None - num_prompt_tokens = len(final_res.prompt_token_ids) - num_generated_tokens = len(final_output.token_ids) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, + # Calculate usage. 
+ assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + num_generated_tokens = len(final_output.token_ids) + num_cached_tokens = final_res.num_cached_tokens + num_reasoning_tokens = 0 + + usage = ResponseUsage( + input_tokens=num_prompt_tokens, + output_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, + input_tokens_details=InputTokensDetails( + cached_tokens=num_cached_tokens), + output_tokens_details=OutputTokensDetails( + reasoning_tokens=num_reasoning_tokens), ) - if self.enable_prompt_tokens_details and final_res.num_cached_tokens: - usage.prompt_tokens_details = PromptTokenUsageInfo( - cached_tokens=final_res.num_cached_tokens) - request_metadata.final_usage_info = usage - response = ResponsesResponse.from_request( request, sampling_params, @@ -457,6 +449,70 @@ class OpenAIServingResponses(OpenAIServing): self.response_store[response.id] = response return response + def _make_response_output_items( + self, + request: ResponsesRequest, + final_output: CompletionOutput, + tokenizer: AnyTokenizer, + ) -> list[ResponseOutputItem]: + if self.reasoning_parser: + try: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + raise e + + reasoning_content, content = ( + reasoning_parser.extract_reasoning_content(final_output.text, + request=request)) + else: + reasoning_content = None + content = final_output.text + + output = [] + if reasoning_content: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=reasoning_content, + type="reasoning_text") + ], + status=None, # NOTE: Only the last output item has status. + ) + output.append(reasoning_item) + if content: + output_text = ResponseOutputText( + text=content, + annotations=[], # TODO + type="output_text", + logprobs=None, # TODO + ) + message = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[output_text], + role="assistant", + status="completed", + type="message", + ) + output.append(message) + return output + + def _make_response_output_items_with_harmony( + self, + context: HarmonyContext, + ) -> list[ResponseOutputItem]: + output_items = [] + num_init_messages = context.num_init_messages + for msg in context.messages[num_init_messages:]: + output_items.extend(parse_output_message(msg)) + # Handle the generation stopped in the middle (if any). 
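# parse_remaining_state() recovers any partial reasoning/final message the
# streaming parser was still building when generation stopped early.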
+ last_items = parse_remaining_state(context.parser) + if last_items: + output_items.extend(last_items) + return output_items + def _construct_input_messages( self, request: ResponsesRequest, From 399d2a10e23fcf37cc7a703d7de50ffecc7e0c6f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Aug 2025 08:54:39 -0700 Subject: [PATCH 068/932] Fix pre-commit error in main (#22462) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/openai/serving_responses.py | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index d40231795b..a7554e0d68 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -422,24 +422,6 @@ class OpenAIServingResponses(OpenAIServing): usage=usage, ) - # Log complete response if output logging is enabled - if self.enable_log_outputs and self.request_logger: - output_text = "" - if content: - output_text = content - elif reasoning_content: - output_text = f"[reasoning: {reasoning_content}]" - - if output_text: - self.request_logger.log_outputs( - request_id=request.request_id, - outputs=output_text, - output_token_ids=final_output.token_ids, - finish_reason=final_output.finish_reason, - is_streaming=False, - delta=False, - ) - if request.store: async with self.response_store_lock: stored_response = self.response_store.get(response.id) @@ -469,6 +451,24 @@ class OpenAIServingResponses(OpenAIServing): reasoning_content = None content = final_output.text + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + output_text = "" + if content: + output_text = content + elif reasoning_content: + output_text = f"[reasoning: {reasoning_content}]" + + if output_text: + self.request_logger.log_outputs( + request_id=request.request_id, + outputs=output_text, + output_token_ids=final_output.token_ids, + finish_reason=final_output.finish_reason, + is_streaming=False, + delta=False, + ) + output = [] if reasoning_content: reasoning_item = ResponseReasoningItem( From 8c9da6be229336a769d9c904415daaa250824c89 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 00:47:07 +0800 Subject: [PATCH 069/932] [Core] Simplify mm processing cache (#22457) Signed-off-by: DarkLight1337 --- .../models/qwen2_5_omni_thinker.py | 12 +- vllm/model_executor/models/transformers.py | 5 +- vllm/multimodal/processing.py | 248 +++++------------- vllm/v1/serial_utils.py | 34 +-- 4 files changed, 95 insertions(+), 204 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index b9fed79c84..a3af541d20 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -431,7 +431,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, enable_hf_prompt_update: bool, - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], BatchFeature, bool]: """ Qwen2.5-Omni reimplements this function to handle text only. 
""" @@ -448,20 +448,20 @@ class Qwen2_5OmniThinkerMultiModalProcessor( else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) - mm_kwargs = self._apply_hf_processor_mm_only( + mm_processed_data = self._apply_hf_processor_mm_only( mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, False + return prompt_ids, mm_processed_data, False def _apply_hf_processor_mm_only( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> MultiModalKwargs: + ) -> BatchFeature: """ Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`. """ @@ -473,14 +473,14 @@ class Qwen2_5OmniThinkerMultiModalProcessor( assert "audio" in mm_counts mm_counts["audio"] -= mm_counts["video"] - _, mm_kwargs, _ = self._apply_hf_processor_text_mm( + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return mm_kwargs + return mm_processed_data def _validate_mm_placeholders( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 0c3df267ed..92e132045c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -22,7 +22,8 @@ from typing import Literal, Optional, Union import regex as re import torch from torch import nn -from transformers import AutoModel, PretrainedConfig, PreTrainedModel +from transformers import (AutoModel, BatchFeature, PretrainedConfig, + PreTrainedModel) from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention @@ -269,7 +270,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ): + ) -> tuple[list[int], BatchFeature, bool]: """ Apply the HF processor on the prompt text and multi-modal data together. 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0378539495..38c5d5d99f 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -18,7 +18,7 @@ from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby +from vllm.utils import flatten_2d_lists, full_groupby from .cache import MultiModalCache from .hasher import MultiModalHasher @@ -887,120 +887,19 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -class ProcessingCacheOptionalItem(NamedTuple): - key: str - value: Optional[MultiModalKwargsItem] - - -class ProcessingCacheItem(NamedTuple): - key: str - value: MultiModalKwargsItem - - class ProcessingCache(MultiModalCache): - def __init__( - self, - capacity_gb: float, - *, - debug_cache_hit_ratio_steps: Optional[int] = None, - ) -> None: + def __init__(self, capacity_gb: float) -> None: super().__init__() - self.debug_cache_hit_ratio_steps = debug_cache_hit_ratio_steps - self.debug_cache_hits = 0 - self.debug_cache_total = 0 + self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) - self._cache = self.get_lru_cache( - capacity_gb, - MultiModalKwargsItem, - debug=bool(debug_cache_hit_ratio_steps), - ) + self.get = self._cache.get + self.put = self._cache.put + self.reset = self._cache.clear - def _maybe_log_cache_stats(self) -> None: - steps = self.debug_cache_hit_ratio_steps - if not steps: - return - total = self.debug_cache_total - if total > 0 and total % steps == 0: - logger.debug("ProcessingCache: hit_ratio = %.2f", - self.debug_cache_hits / total) - logger.debug("ProcessingCache: size = %.2f / %.2f GiB", - self._cache.currsize / GiB_bytes, - self._cache.maxsize / GiB_bytes) - - def get( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - ) -> Optional[MultiModalKwargsItem]: - """ - Get a processed multi-modal item from the cache - according to its dependencies, including: - - - The model ID - - The modality of the item - - The original data item passed to the HF processor - - The configuration options of the HF processor - """ - self._maybe_log_cache_stats() - - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - - if self.debug_cache_hit_ratio_steps: - if cache_key in self._cache: - self.debug_cache_hits += 1 - - self.debug_cache_total += 1 - - return self._cache.get(cache_key) - - def get_item( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - ) -> ProcessingCacheOptionalItem: - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - - return ProcessingCacheOptionalItem( - key=cache_key, - value=self._cache.get(cache_key), - ) - - def put( - self, - model_id: str, - modality: str, - input_item: object, - input_kwargs: Mapping[str, object], - output_kwargs: MultiModalKwargsItem, - ) -> None: - """ - Put a processed multi-modal item into the cache - according to its dependencies - (see [`get`][vllm.multimodal.processing.ProcessingCache.get]). 
- """ - cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, - **{modality: input_item}, - **input_kwargs) - self._cache[cache_key] = output_kwargs - - def put_item(self, item: ProcessingCacheItem) -> None: - self._cache[item.key] = item.value - - def reset(self) -> bool: - self._cache.clear() - - return True +_CacheItemOrHash = Union[MultiModalKwargsItem, str] class BaseProcessingInfo: @@ -1279,7 +1178,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], "BatchFeature", bool]: """ Apply the HF processor on the prompt text and multi-modal data together. @@ -1298,11 +1197,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): prompt_ids, = processed_data.pop("input_ids").tolist() - mm_kwargs = MultiModalKwargs.from_hf_inputs( - processed_data, - self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), - ) - is_update_applied = self._hf_processor_applies_updates( prompt_text=prompt_text, mm_items=mm_items, @@ -1310,11 +1204,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, is_update_applied + return prompt_ids, processed_data, is_update_applied def _apply_hf_processor_text_only( - self, prompt_text: str, - tokenization_kwargs: Mapping[str, object]) -> list[int]: + self, + prompt_text: str, + tokenization_kwargs: Mapping[str, object], + ) -> list[int]: """ Apply the HF processor on the prompt text only. @@ -1353,7 +1249,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object], - ) -> MultiModalKwargs: + ) -> "BatchFeature": """ Apply the HF processor on the multi-modal data only. @@ -1364,14 +1260,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ mm_counts = mm_items.get_all_counts() - _, mm_kwargs, _ = self._apply_hf_processor_text_mm( + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return mm_kwargs + return mm_processed_data def _apply_hf_processor_main( self, @@ -1381,7 +1277,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): tokenization_kwargs: Mapping[str, object], *, enable_hf_prompt_update: bool, - ) -> tuple[list[int], MultiModalKwargs, bool]: + ) -> tuple[list[int], "BatchFeature", bool]: """ Apply the HF processor on the prompt text and multi-modal data. 
@@ -1407,52 +1303,46 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) - mm_kwargs = self._apply_hf_processor_mm_only( + mm_processed_data = self._apply_hf_processor_mm_only( mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, ) - return prompt_ids, mm_kwargs, False + return prompt_ids, mm_processed_data, False def _get_cache_missing_items( self, cache: ProcessingCache, mm_data_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object], - ) -> tuple[dict[str, list[ProcessingCacheOptionalItem]], dict[ - str, list[object]]]: - model_id = self.info.model_id - - mm_cache_items = { - modality: [ - cache.get_item( - model_id, modality, item, - dict(**hf_processor_mm_kwargs, **tokenization_kwargs)) - for item in items - ] - for modality, items in mm_data_items.items() + mm_hashes: MultiModalHashes, + ) -> tuple[dict[str, list[_CacheItemOrHash]], MultiModalDataItems]: + mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]] = { + modality: [(h if (v := cache.get(h)) is None else v) + for h in hashes] + for modality, hashes in mm_hashes.items() } mm_missing_idxs = { modality: [ - idx for idx, item in enumerate(cache_items) - if item.value is None + idx for idx, item_or_hash in enumerate(items_or_hashes) + if isinstance(item_or_hash, str) ] - for modality, cache_items in mm_cache_items.items() + for modality, items_or_hashes in mm_cache_items_or_hashes.items() } mm_missing_data = { modality: [mm_data_items[modality][idx] for idx in idxs] for modality, idxs in mm_missing_idxs.items() } - return mm_cache_items, mm_missing_data + return mm_cache_items_or_hashes, self._to_mm_items(mm_missing_data) def _hash_mm_items( - self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - tokenization_kwargs: Mapping[str, object]) -> MultiModalHashes: + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> MultiModalHashes: """Create MM hashes to be returned (only used in V1).""" model_id = self.info.model_id @@ -1470,34 +1360,25 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _merge_mm_kwargs( self, cache: ProcessingCache, - mm_cache_items: dict[str, list[ProcessingCacheOptionalItem]], - mm_missing_data: dict[str, list[object]], + mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]], mm_missing_kwargs: MultiModalKwargs, - ) -> dict[str, list[ProcessingCacheItem]]: - mm_missing_next_idx = {modality: 0 for modality in mm_missing_data} + ) -> dict[str, list[MultiModalKwargsItem]]: + mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_items = defaultdict[str, list[ProcessingCacheItem]](list) - for modality, cache_items in mm_cache_items.items(): - for cache_item in cache_items: - if cache_item.value is None: + merged_items = defaultdict[str, list[MultiModalKwargsItem]](list) + for modality, items_or_hashes in mm_cache_items_or_hashes.items(): + for item_or_hash in items_or_hashes: + if isinstance(item_or_hash, str): kw_item = mm_missing_kwargs.get_item( modality, mm_missing_next_idx[modality], ) - cache_item_new = ProcessingCacheItem( - key=cache_item.key, - value=kw_item, - ) - - cache.put_item(cache_item_new) + cache.put(item_or_hash, kw_item) mm_missing_next_idx[modality] += 1 else: - cache_item_new = ProcessingCacheItem( - key=cache_item.key, - value=cache_item.value, - ) + kw_item = 
item_or_hash - merged_items[modality].append(cache_item_new) + merged_items[modality].append(kw_item) return dict(merged_items) @@ -1512,7 +1393,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ( prompt_ids, - mm_kwargs, + mm_processed_data, is_update_applied, ) = self._apply_hf_processor_main( prompt=prompt, @@ -1522,6 +1403,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): enable_hf_prompt_update=True, ) + mm_kwargs = MultiModalKwargs.from_hf_inputs( + mm_processed_data, + self._get_mm_fields_config(mm_processed_data, + hf_processor_mm_kwargs), + ) + mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs) if return_mm_hashes else None) @@ -1553,49 +1440,52 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): return_mm_hashes=return_mm_hashes, ) + mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, + tokenization_kwargs) ( - mm_cache_items, - mm_missing_data, + mm_cache_items_or_hashes, + mm_missing_data_items, ) = self._get_cache_missing_items( cache=cache, mm_data_items=mm_data_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - tokenization_kwargs=tokenization_kwargs, + mm_hashes=mm_hashes, ) + mm_hashes_to_return = mm_hashes if return_mm_hashes else None + # NOTE: `prompt` does not correspond to `mm_missing_data_items`, # so we can't apply prompt updates until the new multimodal # items are combined with the cached multimodal items ( prompt_ids, - mm_missing_kwargs, + mm_missing_processed_data, is_update_applied, ) = self._apply_hf_processor_main( prompt=prompt, - mm_items=self._to_mm_items(mm_missing_data), + mm_items=mm_missing_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, tokenization_kwargs=tokenization_kwargs, enable_hf_prompt_update=False, ) + mm_missing_kwargs = MultiModalKwargs.from_hf_inputs( + mm_missing_processed_data, + self._get_mm_fields_config(mm_missing_processed_data, + hf_processor_mm_kwargs), + ) + mm_cache_items_merged = self._merge_mm_kwargs( cache, - mm_cache_items=mm_cache_items, - mm_missing_data=mm_missing_data, + mm_cache_items_or_hashes=mm_cache_items_or_hashes, mm_missing_kwargs=mm_missing_kwargs, ) mm_kwargs = MultiModalKwargs.from_items([ - item.value for cache_items in mm_cache_items_merged.values() + item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) - mm_hashes = { - modality: [item.key for item in cache_items] - for modality, cache_items in mm_cache_items_merged.items() - } if return_mm_hashes else None - - return prompt_ids, mm_kwargs, mm_hashes, is_update_applied + return prompt_ids, mm_kwargs, mm_hashes_to_return, is_update_applied def _bind_and_group_updates( self, diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 809a60c196..9d063f1eda 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -312,25 +312,25 @@ class MsgpackDecoder: return arr.view(torch_dtype).view(shape) def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]: - decoded_items = [] - for item in obj: - elems = [] - for v in item: - v["data"] = self._decode_nested_tensors(v["data"]) - # Reconstruct the field processor using MultiModalFieldConfig - factory_meth_name, *field_args = v["field"] - factory_meth = getattr(MultiModalFieldConfig, - factory_meth_name) + return [self._decode_mm_item(v) for v in obj] - # Special case: decode the union "slices" field of - # MultiModalFlatField - if factory_meth_name == "flat": - field_args[0] = 
self._decode_nested_slices(field_args[0]) + def _decode_mm_item(self, obj: list) -> MultiModalKwargsItem: + return MultiModalKwargsItem.from_elems( + [self._decode_mm_field_elem(v) for v in obj]) - v["field"] = factory_meth(None, *field_args).field - elems.append(MultiModalFieldElem(**v)) - decoded_items.append(MultiModalKwargsItem.from_elems(elems)) - return decoded_items + def _decode_mm_field_elem(self, obj: dict) -> MultiModalFieldElem: + obj["data"] = self._decode_nested_tensors(obj["data"]) + # Reconstruct the field processor using MultiModalFieldConfig + factory_meth_name, *field_args = obj["field"] + factory_meth = getattr(MultiModalFieldConfig, factory_meth_name) + + # Special case: decode the union "slices" field of + # MultiModalFlatField + if factory_meth_name == "flat": + field_args[0] = self._decode_nested_slices(field_args[0]) + + obj["field"] = factory_meth(None, *field_args).field + return MultiModalFieldElem(**obj) def _decode_nested_tensors(self, obj: Any) -> NestedTensors: if isinstance(obj, (int, float)): From 139d155781c187b6d38ac6d84a516c97ff66bb1f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 00:47:10 +0800 Subject: [PATCH 070/932] [Frontend] Use engine argument to control MM cache size (#22441) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 2 +- docs/configuration/optimization.md | 12 ++++-- examples/offline_inference/mistral-small.py | 6 +-- examples/offline_inference/vision_language.py | 4 +- .../multimodal/generation/vlm_utils/core.py | 4 +- .../multimodal/processing/test_llama4.py | 6 +-- tests/models/utils.py | 4 +- vllm/config.py | 43 ++++++++++++++----- vllm/engine/arg_utils.py | 34 ++++++++++++--- vllm/entrypoints/cli/serve.py | 7 ++- vllm/envs.py | 2 +- vllm/multimodal/registry.py | 22 +++++++--- vllm/v1/core/kv_cache_utils.py | 2 +- 13 files changed, 101 insertions(+), 47 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index dcaf1069bf..058eba5fe0 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal processor cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB per API process + 4 GiB per engine core process) +- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process) - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index bb7342c93f..2eeb8ad25d 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -161,12 +161,18 @@ By default, the multi-modal processor cache is enabled to avoid repeatedly proce the same multi-modal inputs via Hugging Face `AutoProcessor`, which commonly occurs in multi-turn conversations. -You can adjust the size of the cache via `VLLM_MM_INPUT_CACHE_GIB` environment variable +You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB per API process + 4 GiB per engine core process). +If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`. 
-If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`: +Examples: ```python +# Use a larger cache llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - disable_mm_preprocessor_cache=True) + mm_processor_cache_gb=8) + +# Disable the cache +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=0) ``` diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 59ec22a1e9..1f6e5ba146 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -68,7 +68,7 @@ def run_simple_demo(args: argparse.Namespace): max_model_len=4096, max_num_seqs=2, tensor_parallel_size=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4, ) prompt = "Describe this image in one sentence." @@ -105,7 +105,7 @@ def run_advanced_demo(args: argparse.Namespace): limit_mm_per_prompt={"image": max_img_per_msg}, max_model_len=max_img_per_msg * max_tokens_per_img, tensor_parallel_size=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + mm_processor_cache_gb=0 if args.disable_mm_processor_cache else 4, ) prompt = "Describe the following image." @@ -164,7 +164,7 @@ def parse_args(): ) parser.add_argument( - "--disable-mm-preprocessor-cache", + "--disable-mm-processor-cache", action="store_true", help="If True, disables caching of multi-modal processor.", ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 5dbe001994..1314d33e90 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1563,7 +1563,7 @@ def parse_args(): ) parser.add_argument( - "--disable-mm-preprocessor-cache", + "--disable-mm-processor-cache", action="store_true", help="If True, disables caching of multi-modal processor.", ) @@ -1603,7 +1603,7 @@ def main(args): engine_args = asdict(req_data.engine_args) | { "seed": args.seed, - "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache, + "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, } llm = LLM(**engine_args) diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index f65385150d..a5d6948f06 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -62,9 +62,7 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- vllm_runner_kwargs_: dict[str, Any] = { - "disable_mm_preprocessor_cache": True, - } + vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0} if model_info.tokenizer: vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer if model_info.tokenizer_mode: diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 9ef7af5562..5e14f0f996 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -15,14 +15,14 @@ from ...utils import build_model_context ["meta-llama/Llama-4-Scout-17B-16E-Instruct"]) @pytest.mark.parametrize("mm_processor_kwargs", [{}]) @pytest.mark.parametrize("num_imgs", [1, 5]) -@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False]) +@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4]) @pytest.mark.parametrize("tokenized_prompt", [True, False]) def test_processor_override( image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict, num_imgs: int, - disable_mm_preprocessor_cache: bool, + mm_processor_cache_gb: int, tokenized_prompt: bool, ): """Ensure llama4 processor works properly.""" @@ -30,7 +30,7 @@ def test_processor_override( model_id, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": num_imgs}, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + mm_processor_cache_gb=mm_processor_cache_gb, ) processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) config = processor.info.get_hf_config() diff --git a/tests/models/utils.py b/tests/models/utils.py index 27ce9de469..1e3d51aeec 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -261,7 +261,7 @@ def build_model_context( model_config_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None, - disable_mm_preprocessor_cache: bool = True, + mm_processor_cache_gb: int = 0, ): """Creates an InputContext for a given model. @@ -291,7 +291,7 @@ def build_model_context( seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, - disable_mm_preprocessor_cache=disable_mm_preprocessor_cache, + mm_processor_cache_gb=mm_processor_cache_gb, hf_overrides=model_info.hf_overrides, **model_config_kwargs, ) diff --git a/vllm/config.py b/vllm/config.py index 44a8d871f0..8dcd429a6b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -443,8 +443,15 @@ class ModelConfig: from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ - disable_mm_preprocessor_cache: bool = False - """If `True`, disable caching of the multi-modal processor.""" + mm_processor_cache_gb: int = 4 + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. 
+ + Set to `0` to disable this cache completely (not recommended).""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -881,17 +888,16 @@ class ModelConfig: limit_per_prompt=self.limit_mm_per_prompt, media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, - disable_mm_preprocessor_cache=self. - disable_mm_preprocessor_cache, + mm_processor_cache_gb=self.mm_processor_cache_gb, interleave_mm_strings=self.interleave_mm_strings) return None - def set_disable_mm_preprocessor_cache(self, value: bool) -> None: + def set_mm_processor_cache_gb(self, value: int) -> None: mm_config = self.get_multimodal_config() - self.disable_mm_preprocessor_cache = value - mm_config.disable_mm_preprocessor_cache = value + self.mm_processor_cache_gb = value + mm_config.mm_processor_cache_gb = value def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( @@ -1698,7 +1704,16 @@ class ModelConfig: if mm_config is None: return False - return not mm_config.disable_mm_preprocessor_cache + return mm_config.mm_processor_cache_gb > 0 + + @property + def enable_mm_processor_cache(self) -> bool: + """Whether the multi-modal processor cache should be enabled.""" + mm_config = self.multimodal_config + if mm_config is None: + return False + + return mm_config.mm_processor_cache_gb > 0 @property def enable_mm_input_cache(self) -> bool: @@ -1707,7 +1722,7 @@ class ModelConfig: if mm_config is None: return False - return not mm_config.disable_mm_preprocessor_cache + return mm_config.mm_processor_cache_gb > 0 def get_mm_input_cache_gb(self) -> int: mm_config = self.multimodal_config @@ -3391,9 +3406,15 @@ class MultiModalConfig: `{"num_crops": 4}`. """ - disable_mm_preprocessor_cache: bool = False + mm_processor_cache_gb: int = 4 """ - If `True`, disable caching of the multi-modal processor. + The size (in GiB) of the multi-modal processor cache, which is used to + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended). 
""" interleave_mm_strings: bool = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a18cd9dde3..d2153dfae3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -358,8 +358,8 @@ class EngineArgs: "media_io_kwargs") mm_processor_kwargs: Optional[Dict[str, Any]] = \ MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = \ - MultiModalConfig.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache: bool = False # DEPRECATED + mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb # LoRA fields enable_lora: bool = False enable_lora_bias: bool = LoRAConfig.bias_enabled @@ -720,8 +720,11 @@ class EngineArgs: "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]) multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", - **multimodal_kwargs["disable_mm_preprocessor_cache"]) + "--mm-processor-cache-gb", + **multimodal_kwargs["mm_processor_cache_gb"]) + multimodal_group.add_argument("--disable-mm-preprocessor-cache", + type=bool, + deprecated=True) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -886,6 +889,23 @@ class EngineArgs: self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" self.load_format = "runai_streamer" + if self.disable_mm_preprocessor_cache: + logger.warning( + "`--disable-mm-preprocessor-cache` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-processor-cache-gb 0` instead.", ) + + self.mm_processor_cache_gb = 0 + elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: + logger.warning( + "VLLM_MM_INPUT_CACHE_GIB` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-processor-cache-gb %d` instead.", + envs.VLLM_MM_INPUT_CACHE_GIB, + ) + + self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -922,7 +942,7 @@ class EngineArgs: use_async_output_proc=not self.disable_async_output_proc, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, - disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache, + mm_processor_cache_gb=self.mm_processor_cache_gb, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1234,13 +1254,13 @@ class EngineArgs: dp_supports_mm_processor_cache = (self.data_parallel_size == 1 or data_parallel_external_lb) if (not dp_supports_mm_processor_cache - and not model_config.disable_mm_preprocessor_cache): + and model_config.mm_processor_cache_gb > 0): logger.warning( "Multi-modal processor cache is disabled because " "it is not compatible with data parallelism when " "there does not exist a one-to-one correspondance " "between API and engine core processes.") - model_config.set_disable_mm_preprocessor_cache(True) + model_config.set_mm_processor_cache_gb(0) speculative_config = self.create_speculative_config( target_model_config=model_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 02b78f103c..803a3e0046 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -138,13 +138,13 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 - orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + orig_mm_processor_cache_gb = args.mm_processor_cache_gb if num_api_servers > 1: setup_multiprocess_prometheus() # Not 
compatible with API server scale-out - args.disable_mm_preprocessor_cache = True + args.mm_processor_cache_gb = 0 listen_address, sock = setup_server(args) @@ -161,8 +161,7 @@ def run_multi_api_server(args: argparse.Namespace): raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " "with api_server_count > 1") - if model_config.is_multimodal_model and not ( - orig_disable_mm_preprocessor_cache): + if model_config.is_multimodal_model and orig_mm_processor_cache_gb > 0: logger.warning("Multi-modal processor cache is disabled because " "it is not compatible with `api_server_count > 1`.") diff --git a/vllm/envs.py b/vllm/envs.py index 212eaf015a..8b12a7ee2b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -561,7 +561,7 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), - # Cache size (in GiB per process) for multimodal input cache + # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache # Default is 4 GiB per API process + 4 GiB per engine core process "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5f5b620e0c..dca04e9a1e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn -from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, @@ -96,11 +95,22 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_GIB) + self._processor_cache: Optional[ProcessingCache] = None + + def _get_processor_cache(self, model_config: "ModelConfig"): + capacity_gb = model_config.mm_processor_cache_gb + if capacity_gb is None: + return None # Overrides `disable_cache` argument + + if self._processor_cache is None: + self._processor_cache = ProcessingCache(capacity_gb) + + return self._processor_cache def reset_processor_cache(self) -> bool: """Reset the multi-modal processing cache.""" - self._processing_cache.reset() + if self._processor_cache: + self._processor_cache.reset() return True # Success @@ -244,14 +254,14 @@ class MultiModalRegistry: if tokenizer is None and not model_config.skip_tokenizer_init: tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: - mm_config = model_config.get_multimodal_config() - disable_cache = mm_config.disable_mm_preprocessor_cache + disable_cache = not model_config.enable_mm_processor_cache model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) - cache = None if disable_cache else self._processing_cache + cache = None if disable_cache else self._get_processor_cache( + model_config) return factories.build_processor(ctx, cache=cache) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 38b1d9b13f..626aa35a77 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -430,7 +430,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, raise ValueError( "The number of multi-modal positions and hashes must match. This " "is likely because you did not enable MM hashing. 
" - "Please set `disable_mm_preprocessor_cache=False`.") + "Please set `mm_processor_cache_gb > 0`.") # Note that we assume mm_positions is sorted by offset. # We do not need to check all mm inputs if the start token index is out of From 7e3a8dc90670fd312ce1e0d4eba9bf11c571e3ad Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 7 Aug 2025 18:13:04 +0100 Subject: [PATCH 071/932] Remove `from_dict` from `SpeculativeConfig` (#22451) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/spec_decode/test_ngram.py | 13 ++++++------- vllm/config.py | 5 ----- vllm/engine/arg_utils.py | 19 +++---------------- 3 files changed, 9 insertions(+), 28 deletions(-) diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index c844925e6c..b7303e0443 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -47,13 +47,12 @@ def test_ngram_proposer(): model_config = ModelConfig(model="facebook/opt-125m") return NgramProposer( vllm_config=VllmConfig(model_config=model_config, - speculative_config=SpeculativeConfig. - from_dict({ - "prompt_lookup_min": min_n, - "prompt_lookup_max": max_n, - "num_speculative_tokens": k, - "method": "ngram", - }))) + speculative_config=SpeculativeConfig( + prompt_lookup_min=min_n, + prompt_lookup_max=max_n, + num_speculative_tokens=k, + method="ngram", + ))) # No match. result = ngram_proposer( diff --git a/vllm/config.py b/vllm/config.py index 8dcd429a6b..7147702edd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2895,11 +2895,6 @@ class SpeculativeConfig: usedforsecurity=False).hexdigest() return hash_str - @classmethod - def from_dict(cls, dict_value: dict) -> "SpeculativeConfig": - """Parse the CLI value for the speculative config.""" - return cls(**dict_value) - @staticmethod def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: if hf_config.model_type == "deepseek_v3": diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d2153dfae3..c0ac3ff631 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -757,18 +757,6 @@ class EngineArgs: lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"]) - # Speculative arguments - speculative_group = parser.add_argument_group( - title="SpeculativeConfig", - description=SpeculativeConfig.__doc__, - ) - speculative_group.add_argument( - "--speculative-config", - type=json.loads, - default=None, - help="The configurations for speculative decoding. 
Should be a " - "JSON string.") - # Observability arguments observability_kwargs = get_kwargs(ObservabilityConfig) observability_group = parser.add_argument_group( @@ -848,6 +836,8 @@ class EngineArgs: title="VllmConfig", description=VllmConfig.__doc__, ) + vllm_group.add_argument("--speculative-config", + **vllm_kwargs["speculative_config"]) vllm_group.add_argument("--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]) vllm_group.add_argument('--kv-events-config', @@ -1033,10 +1023,7 @@ class EngineArgs: "enable_chunked_prefill": enable_chunked_prefill, "disable_log_stats": disable_log_stats, }) - speculative_config = SpeculativeConfig.from_dict( - self.speculative_config) - - return speculative_config + return SpeculativeConfig(**self.speculative_config) def create_engine_config( self, From acf8aeb79e23c32217dd37b5e96847302ae4d0b7 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Fri, 8 Aug 2025 09:57:27 +0800 Subject: [PATCH 072/932] [Misc] normalize multiprocessing Queue usage (#22371) Signed-off-by: Andy Xie --- tests/test_sharded_state_loader.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 1bb4203d21..42afdfa3c7 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -118,8 +118,17 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, tensor_parallel_size=tp_size, )) p.start() - p.join() + # Call queue.get() before p.join() to prevent deadlock: + # If p.join() is called before queue.get() and the queue is full, + # the child process may block while writing to the queue and never + # terminate, causing the parent to wait indefinitely on p.join(). + # See: https://github.com/vllm-project/vllm/pull/22371#discussion_r2257773814 out_before = queue.get() + p.join() + queue.close() + queue.join_thread() + + queue = ctx.Queue() p = ctx.Process(target=_run_generate, args=(output_dir, queue), @@ -131,7 +140,14 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available, load_format="sharded_state", )) p.start() - p.join() + # Call queue.get() before p.join() to prevent deadlock: + # If p.join() is called before queue.get() and the queue is full, + # the child process may block while writing to the queue and never + # terminate, causing the parent to wait indefinitely on p.join(). 
+ # See: https://github.com/vllm-project/vllm/pull/22371#discussion_r2257773814 out_after = queue.get() + p.join() + queue.close() + queue.join_thread() assert out_before == out_after From 1ee5ead5f8f1c3c77b73effcb230ee02952fbe1f Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 7 Aug 2025 19:13:17 -0700 Subject: [PATCH 073/932] [ROCm] [V1] [SpecDec] Enable Speculative Decoding on ROCm V1 Engine (#21496) Signed-off-by: tjtanaa --- tests/utils.py | 16 ++++++++ tests/v1/attention/utils.py | 7 +++- tests/v1/e2e/test_spec_decode.py | 15 ++++++++ tests/v1/spec_decode/test_eagle.py | 55 +++++++++++++++++++++++----- tests/v1/spec_decode/test_max_len.py | 54 +++++++++++++++------------ vllm/v1/spec_decode/eagle.py | 22 ++++++++--- 6 files changed, 128 insertions(+), 41 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 1c1a1cc601..741b4401cc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -986,3 +986,19 @@ def has_module_attribute(module_name, attribute_name): return hasattr(module, attribute_name) except ImportError: return False + + +def get_attn_backend_list_based_on_platform() -> list[str]: + if current_platform.is_cuda(): + return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"] + elif current_platform.is_rocm(): + attn_backend_list = ["TRITON_ATTN_VLLM_V1"] + try: + import aiter # noqa: F401 + attn_backend_list.append("FLASH_ATTN_VLLM_V1") + except Exception: + print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed") + + return attn_backend_list + else: + raise ValueError("Unsupported platform") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e9e574501d..a4e38eb32f 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -11,7 +11,7 @@ import torch from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig, LoadConfig, ModelConfig, ModelDType, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.platforms import _Backend +from vllm.platforms import _Backend, current_platform from vllm.utils import resolve_obj_by_qualname from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec @@ -119,7 +119,10 @@ def get_attention_backend(backend_name: _Backend): """ backend_map = { _Backend.FLASH_ATTN_VLLM_V1: - "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend", + ("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + if current_platform.is_cuda() else + "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" + ), _Backend.FLASHINFER_VLLM_V1: "vllm.v1.attention.backends.flashinfer.FlashInferBackend", _Backend.FLEX_ATTENTION: diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 31f25e94c5..4950faf826 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -8,10 +8,12 @@ from typing import Any, Union import pytest import torch +from tests.utils import get_attn_backend_list_based_on_platform from vllm import LLM, SamplingParams from vllm.assets.base import VLLM_S3_BUCKET_URL from vllm.assets.image import VLM_IMAGES_DIR from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform def get_test_prompts(mm_enabled: bool): @@ -141,11 +143,14 @@ def test_ngram_correctness( marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), ], ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) def 
test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], mm_enabled: bool, + attn_backend: str, ): # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) @@ -156,6 +161,16 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + method, model_name, spec_model_name, tp_size = model_setup ref_llm = LLM(model=model_name, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 73b47f8974..2b4f8bd2a8 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -6,6 +6,7 @@ from unittest import mock import pytest import torch +from tests.utils import get_attn_backend_list_based_on_platform from tests.v1.attention.utils import (BatchSpec, _Backend, create_common_attn_metadata, create_standard_kv_cache_spec, @@ -120,17 +121,28 @@ def test_prepare_inputs(): assert torch.equal(token_indices, expected_token_indices) -@pytest.mark.parametrize("method,proposer_helper", [ - ("eagle", lambda k: _create_proposer("eagle", k)), - ("eagle3", lambda k: _create_proposer("eagle3", k)), -]) +@pytest.mark.parametrize("method", ["eagle", "eagle3"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("pp_size", [1, 2]) @pytest.mark.parametrize("use_distinct_embed_tokens", [True, False]) @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') @mock.patch('vllm.v1.spec_decode.eagle.get_model') def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, - proposer_helper, pp_size, use_distinct_embed_tokens): + attn_backend, pp_size, use_distinct_embed_tokens, + monkeypatch): + + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Setup draft model mock mock_model = mock.MagicMock() if use_distinct_embed_tokens: @@ -177,7 +189,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.lm_head = mock.MagicMock() # Create proposer using the helper function - proposer = proposer_helper(k=8) + proposer = _create_proposer(method, k=8) # Call the method under test proposer.load_model(target_model) @@ -201,10 +213,22 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.model.embed_tokens +@pytest.mark.parametrize("method", ["eagle", "eagle3"]) +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) -@pytest.mark.parametrize("backend", - [_Backend.FLASH_ATTN_VLLM_V1, _Backend.TREE_ATTN]) -def test_propose(num_speculative_tokens, backend): +def test_propose(method, attn_backend, 
num_speculative_tokens, monkeypatch): + + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + # Use GPU device device = torch.device(current_platform.device_type) @@ -303,7 +327,18 @@ def test_propose(num_speculative_tokens, backend): device=device) sampling_metadata = mock.MagicMock() - attn_metadata_builder_cls, _ = get_attention_backend(backend) + if attn_backend == "FLASH_ATTN_VLLM_V1": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.FLASH_ATTN_VLLM_V1) + elif attn_backend == "TRITON_ATTN_VLLM_V1": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.TRITON_ATTN_VLLM_V1) + elif attn_backend == "TREE_ATTN": + attn_metadata_builder_cls, _ = get_attention_backend( + _Backend.TREE_ATTN) + else: + raise ValueError(f"Unsupported attention backend: {attn_backend}") + attn_metadata_builder = attn_metadata_builder_cls( kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), layer_names=proposer.attn_layer_names, diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 9070d2b10f..fef6a5421b 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -4,7 +4,9 @@ import pytest +from tests.utils import get_attn_backend_list_based_on_platform from vllm import LLM, SamplingParams +from vllm.platforms import current_platform _PROMPTS = [ "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", @@ -14,36 +16,40 @@ _PROMPTS = [ @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 10]) -def test_ngram_max_len( - monkeypatch: pytest.MonkeyPatch, - num_speculative_tokens: int, -): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - - llm = LLM( - model="facebook/opt-125m", - max_model_len=100, - enforce_eager=True, # For faster initialization. - speculative_config={ - "method": "ngram", - "prompt_lookup_max": 5, - "prompt_lookup_min": 3, - "num_speculative_tokens": num_speculative_tokens, - }, - ) - sampling_params = SamplingParams(max_tokens=100, ignore_eos=True) - llm.generate(_PROMPTS, sampling_params) +def test_ngram_max_len(num_speculative_tokens: int): + llm = LLM( + model="facebook/opt-125m", + max_model_len=100, + enforce_eager=True, # For faster initialization. 
+ speculative_config={ + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": num_speculative_tokens, + }, + ) + sampling_params = SamplingParams(max_tokens=100, ignore_eos=True) + llm.generate(_PROMPTS, sampling_params) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 10]) -def test_eagle_max_len( - monkeypatch: pytest.MonkeyPatch, - num_speculative_tokens: int, -): +@pytest.mark.parametrize("attn_backend", + get_attn_backend_list_based_on_platform()) +def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, + num_speculative_tokens: int, attn_backend: str): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + + if (attn_backend == "TRITON_ATTN_VLLM_V1" + and not current_platform.is_rocm()): + pytest.skip("TRITON_ATTN_VLLM_V1 does not support " + "multi-token eagle spec decode on current platform") + + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + llm = LLM( model="meta-llama/Meta-Llama-3-8B-Instruct", enforce_eager=True, # For faster initialization. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3c36971fe5..f75d76dd97 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -17,10 +17,14 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) +from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata @@ -230,11 +234,19 @@ class EagleProposer: # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. - # Currently, only FlashAttention and TreeAttention support multi-token - # eagle spec decode. This is because the code below - # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, - (FlashAttentionMetadata, TreeAttentionMetadata)) + # On ROCm, both AiterFlashAttention and TritonAttention + # support multi-token eagle spec decode. + if current_platform.is_rocm(): + assert isinstance( + attn_metadata, + (TritonAttentionMetadata, AiterFlashAttentionMetadata, + FlashAttentionMetadata)) + else: + # Currently, only FlashAttention and TreeAttention support + # multi-token eagle spec decode. This is because the code below + # makes assumptions about attn_metadata attributes available. + assert isinstance(attn_metadata, + (FlashAttentionMetadata, TreeAttentionMetadata)) # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] From e2c8f1edec24f7a89a68e3b48bc65ae683aed0cb Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Thu, 7 Aug 2025 21:15:32 -0500 Subject: [PATCH 074/932] [PERF] Use pybase64 to more quickly decode prompt embeddings (#22469) Signed-off-by: Andrew Sansom --- vllm/entrypoints/openai/serving_engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index efd2f20299..fb9d456df7 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import base64 import io import json import sys @@ -12,6 +11,7 @@ from http import HTTPStatus from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional, TypeVar, Union, cast, overload) +import pybase64 import torch from fastapi import Request from pydantic import BaseModel, ConfigDict, Field @@ -1008,7 +1008,8 @@ class OpenAIServing: ) -> list[EmbedsPrompt]: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: - tensor = torch.load(io.BytesIO(base64.b64decode(embed)), + tensor = torch.load(io.BytesIO( + pybase64.b64decode(embed, validate=True)), weights_only=True) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, From d57dc2364e88e9a1a3e8dc3f6ff8486a7ba040dd Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Thu, 7 Aug 2025 19:18:19 -0700 Subject: [PATCH 075/932] Add ModelOpt Qwen3 nvfp4 support (#20101) Signed-off-by: Zhiyu Cheng --- .../model_loader/weight_utils.py | 66 ++++++++++--------- vllm/model_executor/models/qwen2.py | 13 +++- vllm/model_executor/models/qwen3_moe.py | 16 ++++- 3 files changed, 58 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 074126fa66..78b186265d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -764,39 +764,41 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: return None return remapped_name - possible_scale_names = [".k_scale", ".v_scale"] - modelopt_scale_names = [ - ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" + # Define scale name mapping patterns in order of precedence + scale_mapping_patterns = [ + # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.([kv])_proj\.([kv])_scale$", + r".self_attn.attn.\2_scale"), + # QKV proj format: .self_attn.qkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"), + # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale" + ), + # Default format: .{k,v}_scale -> .attn.{k,v}_scale + (r"\.([kv])_scale$", r".attn.\1_scale"), ] - # Also support qkv_proj scale parameters (from stacked parameter processing) - qkv_proj_scale_names = [ - ".self_attn.qkv_proj.k_scale", ".self_attn.qkv_proj.v_scale" - ] - for scale_name in possible_scale_names: - if name.endswith(scale_name): - if any(mo_scale_name in name - for mo_scale_name in modelopt_scale_names): - remapped_name = name.replace( - f".self_attn.{scale_name[1]}_proj{scale_name}", - f".self_attn.attn{scale_name}") - elif any(qkv_scale_name in name 
- for qkv_scale_name in qkv_proj_scale_names): - # Handle qkv_proj scale parameters - remapped_name = name.replace( - f".self_attn.qkv_proj{scale_name}", - f".self_attn.attn{scale_name}") - else: - remapped_name = name.replace(scale_name, f".attn{scale_name}") - if remapped_name not in params_dict: - logger.warning_once( - "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501 - scale_name, - name, - remapped_name, - scale_name, - ) - return None - return remapped_name + + # Check if name ends with k_scale or v_scale + if name.endswith((".k_scale", ".v_scale")): + import regex as re + + for pattern, replacement in scale_mapping_patterns: + if re.search(pattern, name): + remapped_name = re.sub(pattern, replacement, name) + if remapped_name not in params_dict: + scale_type = name.split(".")[-1] + logger.warning_once( + "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). %s is not loaded.", # noqa: E501 + scale_type, + name, + remapped_name, + scale_type, + ) + return None + return remapped_name # If there were no matches, return the untouched param name return name diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0e7507a457..e4f0de04e9 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -408,9 +408,18 @@ class Qwen2Model(nn.Module): continue if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 7410589190..b2397c115d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -48,7 +48,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -471,12 +472,21 @@ class Qwen3MoeModel(nn.Module): # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue if name not in params_dict: continue param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) break else: is_expert_weight = False From a3b9c17b56d09a091e210222d8e1f75cabe65b84 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Aug 2025 21:18:22 -0500 Subject: [PATCH 076/932] Support Tensorrt-LLM MoE fp4 for low-latency (#21331) Signed-off-by: Shu Wang Signed-off-by: Po-Han Huang Signed-off-by: Shu Wang. Signed-off-by: XIn Li Co-authored-by: XIn Li --- vllm/envs.py | 15 + .../model_executor/layers/fused_moe/config.py | 3 +- .../compressed_tensors_moe.py | 9 +- .../layers/quantization/modelopt.py | 284 ++++++++++++++++-- .../quantization/utils/flashinfer_fp4_moe.py | 12 +- .../quantization/utils/nvfp4_moe_support.py | 4 +- vllm/utils/flashinfer.py | 4 + 7 files changed, 288 insertions(+), 43 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 8b12a7ee2b..f81f6dacd8 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -129,6 +129,7 @@ if TYPE_CHECKING: VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False + VLLM_FLASHINFER_MOE_BACKEND: str = "throughput" VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False @@ -982,6 +983,20 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), + # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. Both + # require compute capability 10.0 or above. + # Available options: + # - "throughput": [default] + # Uses CUTLASS kernels optimized for high-throughput batch inference. + # - "latency": + # Uses TensorRT-LLM kernels optimized for low-latency inference. + # To set this backend, define the environment variable: + # export VLLM_FLASHINFER_MOE_BACKEND=latency. + # If not set, defaults to "throughput". + "VLLM_FLASHINFER_MOE_BACKEND": lambda: os.getenv( + "VLLM_FLASHINFER_MOE_BACKEND", "throughput" + ), + # Control the maximum number of tokens per expert supported by the # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for # the blockscale tensor of activations NVFP4 Quantization. 
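For illustration, the new `VLLM_FLASHINFER_MOE_BACKEND` environment variable is resolved into a kernel choice roughly as follows (a condensed, standalone sketch of the selection logic that `ModelOptNvFp4FusedMoE.__init__` applies in the next file; it reads the variable directly instead of going through `vllm.envs`):

```python
import os
from enum import Enum


class FlashinferMoeBackend(Enum):
    TENSORRT_LLM = "TensorRT-LLM"  # low-latency kernels
    CUTLASS = "CUTLASS"            # high-throughput kernels


def resolve_flashinfer_moe_backend() -> FlashinferMoeBackend:
    backend = os.getenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    if backend == "throughput":
        return FlashinferMoeBackend.CUTLASS
    if backend == "latency":
        return FlashinferMoeBackend.TENSORRT_LLM
    raise ValueError(f"Unknown flashinfer moe backend: {backend}, "
                     "expected one of ['throughput', 'latency']")


print(resolve_flashinfer_moe_backend())  # CUTLASS unless the variable is set
```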
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 9e4ee5a3d7..f2242ade0c 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -192,7 +192,8 @@ class FusedMoEParallelConfig: @property def use_flashinfer_cutlass_kernels(self): return (envs.VLLM_USE_FLASHINFER_MOE_FP4 - and has_flashinfer_cutlass_fused_moe()) + and has_flashinfer_cutlass_fused_moe() + and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput") @staticmethod def make(tp_size_: int, dp_size_: int, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 09d8890888..c04f7c39a5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -105,7 +105,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): detect_nvfp4_moe_support) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 self.fused_experts = None # type: ignore[assignment] @@ -212,7 +212,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): requires_grad=False) # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. - if self.allow_flashinfer_cutlass: + if self.allow_flashinfer: w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2) @@ -266,7 +266,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): (layer.w2_input_global_scale), requires_grad=False) def maybe_swap_experts_impl(self, moe_parallel_config): - if not self.allow_flashinfer_cutlass: + if not self.allow_flashinfer: return self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( moe_parallel_config) @@ -277,8 +277,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 select_nvfp4_gemm_impl) - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) + return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) def apply( self, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 0334a28245..147b275eaf 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum from typing import Any, Callable, Optional, Union import torch @@ -36,6 +37,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) from vllm.scalar_type import scalar_types +from vllm.utils import next_power_of_2 from vllm.utils.flashinfer import has_flashinfer_moe logger = init_logger(__name__) @@ -44,6 +46,11 @@ QUANT_ALGOS = ["FP8", "NVFP4"] KV_CACHE_QUANT_ALGOS = ["FP8"] +class FlashinferMoeBackend(Enum): + TENSORRT_LLM = "TensorRT-LLM" + CUTLASS = "CUTLASS" + + class ModelOptFp8Config(QuantizationConfig): """Config class 
for ModelOpt FP8.""" @@ -185,7 +192,7 @@ class ModelOptFp8LinearMethod(LinearMethodBase): Args: quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config): + def __init__(self, quant_config: ModelOptFp8Config) -> None: self.quant_config = quant_config self.fp8_linear = Fp8LinearOp( act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR) @@ -265,7 +272,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config): + def __init__(self, quant_config: ModelOptFp8Config) -> None: self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported) @@ -670,7 +677,8 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, exclude_modules: list): + def is_layer_excluded(self, prefix: str, + exclude_modules: list[str]) -> bool: import regex as re for pattern in exclude_modules: regex_str = pattern.replace('.', r'\.').replace('*', r'.*') @@ -714,7 +722,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): Args: quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptNvFp4Config): + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.quant_config = quant_config self.cutlass_nvfp4_supported = cutlass_fp4_supported() self.use_marlin = False @@ -859,6 +867,16 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): return out.view(*output_shape) +def _get_tile_tokens_dim(num_tokens: int, top_k: int, num_experts: int) -> int: + # Guess tokens per expert assuming perfect expert distribution first. + num_tokens_per_expert = (num_tokens * top_k) // num_experts + # And pad the number to the next power of 2. + tile_tokens_dim = next_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. + tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim + + class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): """ MoE Method for FP4 Quantization. 
@@ -866,22 +884,40 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): quant_config: NVFP4 Quant Config """ - def __init__(self, quant_config: ModelOptNvFp4Config): + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported - self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin + self.flashinfer_moe_backend = None - self.fused_experts = None # type: ignore + if self.allow_flashinfer: + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + if flashinfer_moe_backend == "throughput": + self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + logger.info_once("Using FlashInfer CUTLASS kernels for " + "ModelOptNvFp4FusedMoE.") + elif flashinfer_moe_backend == "latency": + self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM + logger.info_once("Using FlashInfer TensorRT-LLM kernels for " + "ModelOptNvFp4FusedMoE.") + else: + allowed_backends = ["throughput", "latency"] + raise ValueError( + f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" + f" expected one of {allowed_backends}") + + self.fused_experts: Optional[ + mk.FusedMoEModularKernel] = None # type: ignore[assignment] def maybe_swap_experts_impl( self, moe_parallel_config: FusedMoEParallelConfig, ): - if not self.allow_flashinfer_cutlass: + if not self.allow_flashinfer: return self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( moe_parallel_config) @@ -897,8 +933,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 select_nvfp4_gemm_impl) - return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, - logger) + return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) def uses_weight_scale_2_pattern(self) -> bool: """ @@ -996,14 +1031,101 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): weight_loader=weight_loader) layer.register_parameter("w2_input_scale", w2_input_scale) + def prepare_static_weight_layouts_for_trtllm_moe( + self, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + gemm1_scales_linear_fp4_bytes: torch.Tensor, + gemm2_scales_linear_fp4_bytes: torch.Tensor, + hidden_size: int, + intermediate_size: int, + num_experts: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Prepare quantized weights for kernel (done offline with weights).""" + from flashinfer import (reorder_rows_for_gated_act_gemm, + shuffle_matrix_a, shuffle_matrix_sf_a) + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + + # Convert quantized weights to proper formats + gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape( + num_experts, 2 * intermediate_size, hidden_size // 2) # packed fp4 + gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn).reshape(num_experts, 2 * intermediate_size, + hidden_size // + 16) # fp8 scaling factors + + gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape( + num_experts, hidden_size, intermediate_size // 2) # packed fp4 + gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn).reshape(num_experts, hidden_size, + intermediate_size // + 16) # fp8 scaling factors + + # 
Reorder rows of W1 and scales for fused gated activation + gemm1_weights_fp4_interleaved = [] + gemm1_scales_fp4_interleaved = [] + for i in range(num_experts): + gemm1_weights_fp4_interleaved.append( + reorder_rows_for_gated_act_gemm(gemm1_weights_fp4[i].clone())) + gemm1_scales_fp4_interleaved.append( + reorder_rows_for_gated_act_gemm( + gemm1_scales_linear_fp4[i].clone())) + + # Stack weights and scales for all experts + gemm1_weights_fp4_interleaved = torch.stack( + gemm1_weights_fp4_interleaved).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 2) + gemm1_scales_fp4_interleaved = torch.stack( + gemm1_scales_fp4_interleaved).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 16) + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_fp4_shuffled = [] + gemm1_scales_fp4_shuffled = [] + gemm2_weights_fp4_shuffled = [] + gemm2_scales_fp4_shuffled = [] + for i in range(num_experts): + gemm1_weights_fp4_shuffled.append( + shuffle_matrix_a( + gemm1_weights_fp4_interleaved[i].view(torch.uint8), + epilogue_tile_m)) + gemm1_scales_fp4_shuffled.append( + shuffle_matrix_sf_a( + gemm1_scales_fp4_interleaved[i].view(torch.uint8), + epilogue_tile_m)) + + gemm2_weights_fp4_shuffled.append( + shuffle_matrix_a(gemm2_weights_fp4[i].view(torch.uint8), + epilogue_tile_m)) + gemm2_scales_fp4_shuffled.append( + shuffle_matrix_sf_a( + gemm2_scales_linear_fp4[i].view(torch.uint8), + epilogue_tile_m)) + + # Stack weights for all experts + gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled) + gemm1_scales_fp4_shuffled = ( + torch.stack(gemm1_scales_fp4_shuffled).view( + torch.float8_e4m3fn).reshape(num_experts, + 2 * intermediate_size, + hidden_size // 16)) + + gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled) + gemm2_scales_fp4_shuffled = ( + torch.stack(gemm2_scales_fp4_shuffled).view( + torch.float8_e4m3fn).reshape(num_experts, hidden_size, + intermediate_size // 16)) + return (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # GEMM 1 - # The FlashInfer Cutlass fused MoE kernel expects the combined weights - # to be ordered as [w3, w1], unlike the standard [w1, w3] layout. 
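+        # The FlashInfer CUTLASS fused-MoE kernel expects the combined w13
+        # weights (and their block scales) to be ordered as [w3, w1], unlike
+        # the standard [w1, w3] layout; reorder_w1w3_to_w3w1 below performs
+        # that swap whenever FlashInfer is allowed.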
+ # GEMM 1 processing gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if self.allow_flashinfer_cutlass: + if self.allow_flashinfer: gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2) @@ -1011,6 +1133,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w13_weight_scale = Parameter(gemm1_weight_scale, requires_grad=False) + # Common processing for w13_weight_scale_2 if not torch.allclose(layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]): logger.warning_once( @@ -1021,26 +1144,18 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False) + # Common processing for input scales and alphas w13_input_scale = layer.w13_input_scale.max(dim=1).values.to( torch.float32) layer.g1_alphas = Parameter( (w13_input_scale * w13_weight_scale_2).to(torch.float32), requires_grad=False) - assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( - "Expected weight_scale.dim(1) to be divisible by 16") - assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), ( - "Weight Blockscale must be represented as FP8-E4M3") - w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale) - - layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, - requires_grad=False) - # This is for quantization, so we need to invert it. layer.w13_input_scale_quant = Parameter( (1 / w13_input_scale).to(torch.float32), requires_grad=False) - # GEMM 2 + # GEMM 2 processing layer.g2_alphas = Parameter( (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32), requires_grad=False) @@ -1049,15 +1164,63 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer.w2_input_scale_quant = Parameter( (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False) - assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( - "Expected weight_scale.dim(1) to be divisible by 16") - assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( - "Weight Blockscale must be represented as FP8-E4M3") - w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) + # TensorRT-LLM specific processing + if self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + # Prepare static weights for TRT-LLM kernel + (gemm1_weights_fp4_shuffled, gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled + ) = self.prepare_static_weight_layouts_for_trtllm_moe( + layer.w13_weight, + layer.w2_weight, + layer.w13_weight_scale, + layer.w2_weight_scale, + layer.w2_weight.size(-2), # hidden_size + layer.w13_weight.size(-2) // 2, # intermediate_size + layer.w13_weight.size(0), # num_experts + ) - layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, - requires_grad=False) - layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) + layer.gemm1_weights_fp4_shuffled = Parameter( + gemm1_weights_fp4_shuffled, requires_grad=False) + layer.gemm2_weights_fp4_shuffled = Parameter( + gemm2_weights_fp4_shuffled, requires_grad=False) + layer.gemm1_scales_fp4_shuffled = Parameter( + gemm1_scales_fp4_shuffled, requires_grad=False) + layer.gemm2_scales_fp4_shuffled = Parameter( + gemm2_scales_fp4_shuffled, requires_grad=False) + + # Additional parameter needed for TRT-LLM + layer.g1_scale_c = Parameter( + (layer.w2_input_scale_quant * layer.g1_alphas).to( + torch.float32), + requires_grad=False, + ) + + # Clean up weights that won't be used by TRT-LLM + del layer.w2_weight + del 
layer.w2_weight_scale + del layer.w13_weight + del layer.w13_weight_scale + else: + # Non-TRT-LLM processing (Cutlass or non-flashinfer) + assert (layer.w13_weight_scale.shape[2] % 16 == 0), ( + "Expected weight_scale.dim(1) to be divisible by 16") + assert (layer.w13_weight_scale.dtype == torch.float8_e4m3fn), ( + "Weight Blockscale must be represented as FP8-E4M3") + w13_blockscale_swizzled = swizzle_blockscale( + layer.w13_weight_scale) + layer.w13_blockscale_swizzled = Parameter(w13_blockscale_swizzled, + requires_grad=False) + + assert (layer.w2_weight_scale.shape[2] % 16 == 0), ( + "Expected weight_scale.dim(1) to be divisible by 16") + assert (layer.w2_weight_scale.dtype == torch.float8_e4m3fn), ( + "Weight Blockscale must be represented as FP8-E4M3") + w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale) + layer.w2_blockscale_swizzled = Parameter(w2_blockscale_swizzled, + requires_grad=False) + layer.w2_weight = Parameter(layer.w2_weight.data, + requires_grad=False) if self.use_marlin: prepare_moe_fp4_layer_for_marlin(layer) @@ -1095,6 +1258,60 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.") assert activation == "silu", "Only SiLU activation is supported." + if self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + import flashinfer + + from vllm.model_executor.models.llama4 import Llama4MoE + + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, + hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + use_llama4_routing = \ + custom_routing_function is Llama4MoE.custom_routing_function + routing_method_type = flashinfer.RoutingMethodType.DeepSeekV3 + if use_llama4_routing: + routing_method_type = flashinfer.RoutingMethodType.Llama4 + out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( + routing_logits=router_logits + if use_llama4_routing else router_logits.to(torch.float32), + routing_bias=e_score_correction_bias, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k, + layer.local_num_experts), + routing_method_type=routing_method_type, + do_finalize=True, + )[0] + return out + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -1149,6 +1366,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: + assert self.allow_flashinfer and \ + self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS out = 
flashinfer_fp4_cutlass_moe_forward( self.fused_experts, layer, @@ -1160,4 +1379,5 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, ) + return out diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4c617e2260..8ef91eeed4 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -126,7 +126,7 @@ def flashinfer_fp4_cutlass_moe_forward( def select_nvfp4_gemm_impl( - allow_flashinfer_cutlass: bool, + allow_flashinfer: bool, moe, # FusedMoEConfig logger): """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" @@ -137,8 +137,14 @@ def select_nvfp4_gemm_impl( all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None - if allow_flashinfer_cutlass: - logger.debug_once("Using FlashInferExperts") + if allow_flashinfer: + flashinfer_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + if flashinfer_backend != "throughput": + raise ValueError( + f"Only throughput backend is supported for FlashInferExperts, " + f"but got {flashinfer_backend}.") + logger.debug_once( + "Initializing FlashInferExperts with throughput backend.") return FlashInferExperts( use_nvfp4_w4a4=True, use_dp=moe.moe_parallel_config.dp_size > 1, diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py index 23a749467f..21af74c6b7 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -21,7 +21,7 @@ class NvFp4Support: """Result container for NV-FP4 capability probing.""" cutlass_supported: bool - allow_flashinfer_cutlass: bool + allow_flashinfer: bool use_marlin: bool @@ -54,6 +54,6 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: return NvFp4Support( cutlass_supported=cutlass_supported, - allow_flashinfer_cutlass=allow_flashinfer, + allow_flashinfer=allow_flashinfer, use_marlin=use_marlin, ) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 32c52612ca..5998d4c312 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -86,6 +86,8 @@ flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe", fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") nvfp4_block_scale_interleave = _lazy_import_wrapper( "flashinfer", "nvfp4_block_scale_interleave") +trtllm_fp4_block_scale_moe = _lazy_import_wrapper( + "flashinfer", "trtllm_fp4_block_scale_moe") # Special case for autotune since it returns a context manager autotune = _lazy_import_wrapper( @@ -112,6 +114,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: ("flashinfer.fused_moe", "cutlass_fused_moe"), ("flashinfer", "fp4_quantize"), ("flashinfer", "nvfp4_block_scale_interleave"), + ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"), ] for module_name, attr_name in required_functions: @@ -188,6 +191,7 @@ __all__ = [ "flashinfer_cutlass_fused_moe", "fp4_quantize", "nvfp4_block_scale_interleave", + "trtllm_fp4_block_scale_moe", "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", From b2c8ce57c68db0764a49d66f048b8a7a5cef9d13 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Aug 2025 21:18:25 -0500 Subject: [PATCH 077/932] Fix Flashinfer CUTLASS MOE 
Allgather (#21963) Signed-off-by: Shu Wang --- .../device_communicators/cuda_communicator.py | 3 +- vllm/forward_context.py | 58 +++++++++++++++++++ .../flashinfer_cutlass_prepare_finalize.py | 24 ++------ vllm/model_executor/layers/fused_moe/layer.py | 13 +++-- 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 4ab8f3d938..66d4940c9c 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -236,7 +236,8 @@ class CudaCommunicator(DeviceCommunicatorBase): input_size = input_.size() if sizes is not None: assert len(sizes) == world_size - assert input_.shape[dim] == sizes[self.rank_in_group] + assert input_.shape[dim] == sizes[self.rank_in_group], ( + f"{input_.shape[dim]} != {sizes[self.rank_in_group]}") output_size = (sum(sizes), ) + input_size[1:] else: output_size = (input_size[0] * world_size, ) + input_size[1:] diff --git a/vllm/forward_context.py b/vllm/forward_context.py index dd55b19fee..4686ba24e6 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -26,10 +26,26 @@ batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL batchsize_forward_time: defaultdict = defaultdict(list) +def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int], + max_num_tokens: int, + chunk_idx: int) -> list[int]: + dp_size = len(num_tokens_across_dp_cpu) + + local_size = [-1] * dp_size + for i in range(dp_size): + dp_tokens = num_tokens_across_dp_cpu[i] + local_size[i] = min(max_num_tokens, + dp_tokens - (max_num_tokens * chunk_idx)) + if local_size[i] <= 0: + local_size[i] = 1 # ensure lockstep even if done + return local_size + + @dataclass class DPMetadata: max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor + local_sizes: Optional[list[int]] = None @staticmethod def num_tokens_across_dp(num_tokens: int, dp_size: int, @@ -78,6 +94,48 @@ class DPMetadata: cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0) return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) + @contextmanager + def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int): + """ + Context manager to compute and temporarily set the per-rank local token + sizes for a specific chunk during chunked forward execution. + + This is necessary to ensure each DP (data parallel) rank processes its + designated portion of tokens in lockstep with others, even when the + token counts are uneven or some ranks have completed their input early. + + For chunked execution, we break up the total tokens on each rank into + multiple chunks (of at most `max_chunk_size_per_rank`), and for a given + `chunk_idx`, this context manager sets `self.local_sizes` to the number + of tokens to process in that chunk on each rank. + + It uses cumulative sizes (`cu_tokens_across_dp_cpu`) to derive the + number of tokens per rank, and calls `_compute_chunked_local_num_tokens` + to determine the chunk-wise split. + + `self.local_sizes` is only valid inside the context. + + Args: + max_chunk_size_per_rank: The max number of tokens each rank is + allowed to process in this chunk. + chunk_idx: The index of the chunk to compute sizes for. 
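+
+        Example (illustrative numbers only): with
+        cu_tokens_across_dp_cpu = [8, 20, 26] (i.e. per-rank token counts
+        [8, 12, 6]), max_chunk_size_per_rank = 8 and chunk_idx = 1, the
+        computed local sizes are [1, 4, 1]; ranks whose tokens are already
+        exhausted still report 1 so that every rank stays in lockstep.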
+ """ + cu_sizes = self.cu_tokens_across_dp_cpu + num_tokens_across_dp_cpu = [ + (cu_sizes[i] - + cu_sizes[i - 1]).item() if i > 0 else cu_sizes[0].item() + for i in range(len(cu_sizes)) + ] + self.local_sizes = _compute_chunked_local_num_tokens( + num_tokens_across_dp_cpu, max_chunk_size_per_rank, chunk_idx) + try: + yield self.local_sizes + finally: + self.local_sizes = None + + def get_chunk_sizes_across_dp_rank(self) -> Optional[list[int]]: + return self.local_sizes + @dataclass class ForwardContext: diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 02e1d1f1fd..7fdb465c45 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -4,7 +4,6 @@ from typing import Any, Optional import torch -import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context @@ -14,20 +13,8 @@ from vllm.model_executor.layers.fused_moe.utils import ( from vllm.utils.flashinfer import nvfp4_block_scale_interleave -def get_local_sizes(local_tokens): - cu_sizes = get_forward_context().dp_metadata.cu_tokens_across_dp_cpu - sizes = [cu_sizes[0].item()] - for i in range(1, len(cu_sizes)): - sizes.append((cu_sizes[i] - cu_sizes[i - 1]).item()) - max_num_tokens = envs.VLLM_MOE_DP_CHUNK_SIZE - sizes_chunked = [max_num_tokens] * len(sizes) - if local_tokens < max_num_tokens: - # When the number of local tokens is less than max_num_tokens, all other - # ranks will also have fewer than max_num_tokens. The remaining tokens - # are accounted for as residual. 
- sizes_chunked = [x % max_num_tokens for x in sizes] - - return sizes_chunked +def get_local_sizes(): + return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank() class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): @@ -90,7 +77,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_weights, topk_ids, a1q, a1q_scale = \ get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, a1q_scale], # noqa: E501 dim=0, - sizes=get_local_sizes(local_tokens)) + sizes=get_local_sizes()) a1_m, a1_n = a1q.shape a1q_scale = nvfp4_block_scale_interleave(a1q_scale) @@ -107,8 +94,5 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ['use_dp', 'local_tokens']) if use_dp: fused_expert_output = get_dp_group().reduce_scatterv( - fused_expert_output, - dim=0, - sizes=get_local_sizes(local_tokens), - ) + fused_expert_output, dim=0, sizes=get_local_sizes()) output.copy_(fused_expert_output) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 76cedb3ed3..272b6ce672 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1570,18 +1570,19 @@ class FusedMoE(torch.nn.Module): max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens num_tokens = full_hidden_states.size(0) - for chunk_start_ in range(0, max_tokens_across_dp, - moe_dp_chunk_size_per_rank): + for chunk_idx, chunk_start_ in enumerate( + range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank)): chunk_start = chunk_start_ chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dp) # clamp start and end chunk_start = min(chunk_start, num_tokens - 1) chunk_end = min(chunk_end, num_tokens) - - process_chunk(chunk_start, - chunk_end, - skip_result_store=chunk_start_ >= num_tokens) + with ctx.dp_metadata.chunked_sizes(moe_dp_chunk_size_per_rank, + chunk_idx): + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= num_tokens) return full_final_hidden_states From 3303f134e03f7a80b42e50065976be9d499c8683 Mon Sep 17 00:00:00 2001 From: Junhao Li Date: Thu, 7 Aug 2025 22:18:28 -0400 Subject: [PATCH 078/932] [Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO 6000) (#22131) Signed-off-by: Junhao Li --- CMakeLists.txt | 1 + csrc/cutlass_extensions/common.hpp | 10 + .../c3x/scaled_mm_blockwise_sm120_fp8.cu | 23 +++ ...scaled_mm_blockwise_sm120_fp8_dispatch.cuh | 183 ++++++++++++++++++ .../cutlass_w8a8/c3x/scaled_mm_kernels.hpp | 6 + .../cutlass_w8a8/scaled_mm_c3x_sm120.cu | 24 +-- 6 files changed, 229 insertions(+), 18 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu create mode 100644 csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index e2cc0ccdef..093330caa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,6 +427,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 195872e8ed..f2c1dcf69f 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -60,3 
+60,13 @@ struct enable_sm100_only : Kernel { #endif } }; + +template +struct enable_sm120_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu new file mode 100644 index 0000000000..5515374a57 --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu @@ -0,0 +1,23 @@ +#include "scaled_mm_kernels.hpp" +#include "scaled_mm_blockwise_sm120_fp8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +namespace vllm { + +void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + if (out.dtype() == torch::kBFloat16) { + cutlass_gemm_blockwise_sm120_fp8_dispatch( + out, a, b, a_scales, b_scales); + + } else { + TORCH_CHECK(out.dtype() == torch::kFloat16); + cutlass_gemm_blockwise_sm120_fp8_dispatch( + out, a, b, a_scales, b_scales); + } +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh new file mode 100644 index 0000000000..d50a83ae1c --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh @@ -0,0 +1,183 @@ +#pragma once + +#include "cuda_utils.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cute/tensor.hpp" +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler_params.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass_extensions/gemm/dispatch_policy.hpp" +#include "cutlass_extensions/gemm/collective/collective_builder.hpp" + +#include "cutlass_gemm_caller.cuh" + +namespace vllm { + +using namespace cute; + +// clang-format off +template +struct cutlass_3x_gemm_fp8_blockwise { + using ElementAB = cutlass::float_e4m3_t; + + using ElementA = ElementAB; + using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using ElementB = ElementAB; + // ColumnMajor is used for B to match the CUTLASS convention. 
+ using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + using ElementD = OutType; + using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + using ElementC = void; // TODO: support bias + using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; + static constexpr int AlignmentC = AlignmentD; + + using ElementAccumulator = float; + using ElementCompute = float; + using ElementBlockScale = float; + + using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>; + + // layout_SFA and layout_SFB cannot be swapped since they are deduced. + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); + + using ArchTag = cutlass::arch::Sm120; + using OperatorClass = cutlass::arch::OpClassTensorOp; + + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementScalar = float; + using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + MmaTileShape, + ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + ElementAccumulator, + ElementCompute, + ElementC, + LayoutC, + AlignmentC, + ElementD, + LayoutD, + AlignmentD, + EpilogueScheduler, + DefaultOperation + >::CollectiveOp; + + using StageCountType = cutlass::gemm::collective::StageCountAuto; + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp; + + using KernelType = enable_sm120_only, CollectiveMainloop, CollectiveEpilogue>>; + + struct GemmKernel : public KernelType {}; +}; + +template +void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using GemmKernel = typename Gemm::GemmKernel; + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideD = typename Gemm::GemmKernel::StrideD; + using StrideC = typename Gemm::GemmKernel::StrideC; + using LayoutSFA = typename Gemm::LayoutSFA; + using LayoutSFB = typename Gemm::LayoutSFB; + using ScaleConfig = typename Gemm::ScaleConfig; + + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + + StrideA a_stride; + StrideB b_stride; + StrideC c_stride; + a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + c_stride = + cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + + LayoutSFA layout_SFA = + ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); + LayoutSFB layout_SFB = + 
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto a_scales_ptr = static_cast(a_scales.data_ptr()); + auto b_scales_ptr = static_cast(b_scales.data_ptr()); + + auto mainloop_args = [&](){ + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + }(); + auto prob_shape = cute::make_shape(m, n, k, 1); + + auto c_ptr = static_cast(out.data_ptr()); + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, c_ptr, c_stride, c_ptr, c_stride}; + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template +void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + // TODO: better heuristics + cutlass_gemm_caller_blockwise, + Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueScheduleAuto, + cutlass::gemm::collective::KernelScheduleAuto>>( + out, a, b, a_scales, b_scales); +} + +} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp index e049a5f2d2..9ceb3a3ece 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_kernels.hpp @@ -47,4 +47,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales); + +void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out, + torch::Tensor const& a, + torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales); } // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu index 0c47ab8299..dc87c5c35c 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu @@ -1,11 +1,9 @@ -#include +#include "c3x/scaled_mm_helper.hpp" #include "c3x/scaled_mm_kernels.hpp" -#include "cuda_utils.h" - /* This file defines quantized GEMM operations using the CUTLASS 3.x API, for - NVIDIA GPUs with sm120 (Blackwell Geforce). + NVIDIA GPUs with sm120 (Blackwell). 
*/ #if defined ENABLE_SCALED_MM_SM120 && ENABLE_SCALED_MM_SM120 @@ -15,20 +13,10 @@ void cutlass_scaled_mm_sm120(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, std::optional const& bias) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - - int M = a.size(0), N = b.size(1), K = a.size(1); - TORCH_CHECK( - (a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && - (b_scales.numel() == 1 || b_scales.numel() == b.size(1)), - "Currently, block scaled fp8 gemm is not implemented for Blackwell"); - - // Standard per-tensor/per-token/per-channel scaling - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn, - "Currently, only fp8 gemm is implemented for Blackwell"); - vllm::cutlass_scaled_mm_sm120_fp8(c, a, b, a_scales, b_scales, bias); + dispatch_scaled_mm(c, a, b, a_scales, b_scales, bias, + vllm::cutlass_scaled_mm_sm120_fp8, + nullptr, // int8 not supported on SM120 + vllm::cutlass_scaled_mm_blockwise_sm120_fp8); } #endif From 17eaaef59504aa6786cbf89a8d5012d7b64839de Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 8 Aug 2025 10:20:21 +0800 Subject: [PATCH 079/932] [Bugfix] Fix RuntimeError: Index put requires the source and destination dtypes match (#22065) Signed-off-by: chaunceyjiang --- .../test_completion_with_image_embeds.py | 103 ++++++++++++++++++ vllm/model_executor/models/utils.py | 5 +- 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 tests/v1/entrypoints/openai/test_completion_with_image_embeds.py diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py new file mode 100644 index 0000000000..be98be8d14 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import base64 +import io +import json + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +import torch +from transformers import AutoConfig + +from tests.conftest import ImageTestAssets +from tests.utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "llava-hf/llava-1.5-7b-hf" +CONFIG = AutoConfig.from_pretrained(MODEL_NAME) +MAXIMUM_IMAGES = 2 + + +@pytest.fixture(scope="module") +def default_image_embeds_server_args() -> list[str]: + return [ + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "4", + "--enforce-eager", + "--limit-mm-per-prompt", + json.dumps({"image": MAXIMUM_IMAGES}), + ] + + +@pytest.fixture(scope="module") +def server_with_image_embeds(default_image_embeds_server_args): + with RemoteOpenAIServer(MODEL_NAME, + default_image_embeds_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client_with_image_embeds(server_with_image_embeds): + async with server_with_image_embeds.get_async_client() as async_client: + yield async_client + + +def encode_image_embedding_to_base64(image_embedding) -> str: + """ + Encode image embedding to base64 string + """ + buffer = io.BytesIO() + torch.save(image_embedding, buffer) + buffer.seek(0) + binary_data = buffer.read() + base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') + return base64_image_embedding + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32]) +async def test_completions_with_image_embeds( + client_with_image_embeds: openai.AsyncOpenAI, + model_name: str, + image_assets: ImageTestAssets, + dtype: torch.dtype, +): + # Test case: Single image embeds input + image_embeds = image_assets[0].image_embeds.to(dtype=dtype) + base64_image_embedding = encode_image_embedding_to_base64(image_embeds) + chat_completion = await client_with_image_embeds.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": + "text", + "text": + "Describe these images separately. For each image," + "reply with a short sentence (no more than 10 words).", + }, + { + "type": "image_embeds", + "image_embeds": base64_image_embedding, + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index fecd14dde4..c69df6e616 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -401,7 +401,7 @@ def merge_multimodal_embeddings_from_map( """ flattened_embeddings = _flatten_embeddings(multimodal_embeddings) inputs_embeds[placeholder_map.dest] = flattened_embeddings[ - placeholder_map.src] + placeholder_map.src].to(dtype=inputs_embeds.dtype) return inputs_embeds @@ -421,7 +421,8 @@ def _merge_multimodal_embeddings( flattened = _flatten_embeddings(multimodal_embeddings) try: # This is equivalent to: inputs_embeds[is_multimodal] = flattened. 
- inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened) + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), + flattened.to(dtype=inputs_embeds.dtype)) except RuntimeError as e: num_expected_tokens = is_multimodal.sum().item() assert isinstance(num_expected_tokens, int) From c152e2a8a0f49edfc06d760f04ff617310384757 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Fri, 8 Aug 2025 10:37:23 +0800 Subject: [PATCH 080/932] not tie_word_embeddings for glm-4.5 and glm-4.5v (#22460) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index bd3e27662e..0053e4e6ff 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -601,8 +601,6 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): quant_config=quant_config) else: self.lm_head = PPMissingLayer() - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) From 6f287915d8e4c2c09e7db2eb5cb670036d33f478 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Fri, 8 Aug 2025 11:18:50 +0800 Subject: [PATCH 081/932] Optimize MiniCPMO mask creation with vectorized implementation (#22464) Signed-off-by: zitian.zhao Signed-off-by: zitian zhao --- vllm/model_executor/models/minicpmo.py | 32 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 4e4fc3d5c7..fd91c7fcc1 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -587,15 +587,29 @@ class MiniCPMO(MiniCPMV2_6): num_lookhead: int = 0, ) -> torch.Tensor: ret = torch.zeros(size, size, device=device, dtype=torch.bool) - for i in range(size): - if num_left_chunks < 0: - start = 0 - else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, - 0) - ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, - size) - ret[i, start:ending] = True + # Vectorized computation of row indices and chunk boundaries + row_indices = torch.arange(size, device=device) + chunk_indices = row_indices // chunk_size + if num_left_chunks < 0: + # If num_left_chunks < 0, start is always 0 for all rows + start_indices = torch.zeros_like(row_indices) + else: + # Compute start indices vectorially + start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, + min=0) + start_indices = start_chunk_indices * chunk_size + # Compute ending indices vectorially + end_chunk_indices = chunk_indices + 1 + end_indices = torch.clamp(end_chunk_indices * chunk_size + + num_lookhead, + max=size) + # Create column indices for broadcasting + col_indices = torch.arange(size, device=device).unsqueeze(0) + row_indices = row_indices.unsqueeze(1) + start_indices = start_indices.unsqueeze(1) + end_indices = end_indices.unsqueeze(1) + # Vectorized mask creation + ret = (col_indices >= start_indices) & (col_indices < end_indices) return ret def _get_feat_extract_output_lengths(self, From 157f9c13687e38b89fdeb20ecdbb75baf8153e0f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 11:21:54 +0800 Subject: [PATCH 082/932] Fix pre-commit (#22487) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/minicpmo.py | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index fd91c7fcc1..1ee0a94c37 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -589,7 +589,7 @@ class MiniCPMO(MiniCPMV2_6): ret = torch.zeros(size, size, device=device, dtype=torch.bool) # Vectorized computation of row indices and chunk boundaries row_indices = torch.arange(size, device=device) - chunk_indices = row_indices // chunk_size + chunk_indices = row_indices // chunk_size if num_left_chunks < 0: # If num_left_chunks < 0, start is always 0 for all rows start_indices = torch.zeros_like(row_indices) @@ -597,7 +597,7 @@ class MiniCPMO(MiniCPMV2_6): # Compute start indices vectorially start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, min=0) - start_indices = start_chunk_indices * chunk_size + start_indices = start_chunk_indices * chunk_size # Compute ending indices vectorially end_chunk_indices = chunk_indices + 1 end_indices = torch.clamp(end_chunk_indices * chunk_size + From af473f0a85731c17d9cf708deec3e864e674feb0 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:25:01 +0800 Subject: [PATCH 083/932] [bugfix] Fix Llama3/4 issues caused by FlashInfer 0.2.10 (#22426) Signed-off-by: Po-Han Huang --- .../quantization/utils/flashinfer_utils.py | 22 +++++++++++++------ vllm/v1/attention/backends/flashinfer.py | 3 ++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index c6f914febc..9fb194767e 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -6,14 +6,22 @@ import torch def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): - from flashinfer import next_positive_power_of_2 - # Guess tokens per expert assuming perfect expert distribution first. - num_tokens_per_expert = (num_tokens * top_k) // num_experts - # And pad the number to the next power of 2. - tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) - # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. - tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now. + # TODO: Revert this to dynamic calculation once a new version of FlashInfer + # with the necessary kernels is released. + tile_tokens_dim = 8 + + # from flashinfer import next_positive_power_of_2 + + # # Guess tokens per expert assuming perfect expert distribution first. + # num_tokens_per_expert = (num_tokens * top_k) // num_experts + # # And pad the number to the next power of 2. + # tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) + # # Cap to 8-64 tokens per CTA tile as it's the range supported by the + # # kernel. 
+ # tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1fcb190286..c85d8bce31 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -524,7 +524,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): head_dim = self.kv_cache_spec.head_size # currently prefill trtllm attention does not support fp8 kv cache - prefill_use_trtllm = use_trtllm_attention( + prefill_use_trtllm = not cache_dtype.startswith("fp8") \ + and use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, num_qo_heads, num_kv_heads, head_dim) decode_use_trtllm = use_trtllm_attention( From 099c0464637f330f8ea38b07fe0694717c16d815 Mon Sep 17 00:00:00 2001 From: iAmir97 <71513472+iAmir97@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:25:18 +0700 Subject: [PATCH 084/932] [Doc] Sleep mode documentation (#22310) Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Hong Hanh Co-authored-by: youkaichao --- docs/features/sleep_mode.md | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docs/features/sleep_mode.md diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md new file mode 100644 index 0000000000..5749b02d26 --- /dev/null +++ b/docs/features/sleep_mode.md @@ -0,0 +1,80 @@ +# Sleep Mode + +vLLM's Sleep Mode allows you to temporarily release most GPU memory used by a model, including model weights and KV cache, without stopping the server or unloading the Docker container. This is especially useful for RLHF, training, or cost-saving scenarios where GPU resources need to be freed between inference workloads. + +Key benefits: + +- **Frees GPU memory**: Offloads model weights to CPU RAM and discards KV cache, releasing up to 90%+ of GPU memory for other tasks. +- **Fast resume**: Quickly wake up the engine and resume inference without full model reload. +- **API endpoints**: Control sleep/wake_up state via HTTP endpoints or Python API. +- **Supports distributed workloads**: Works with tensor parallelism, pipeline parallelism, etc. +- **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates. + +!!! note + This feature is only supported on CUDA platform. + +## Sleep levels + +Level 1 sleep will offload the model weights and discard the KV cache. The content of KV cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the KV cache (while the model's buffers are kept in CPU, like rope scaling tensors). The content of both the model weights and KV cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed, e.g. RLHF weight update. + +## Usage + +### Offline inference + +Enable sleep mode by passing `enable_sleep_mode=True` to the `LLM` class. 
+ +```python +from vllm import LLM +llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True) +``` + +#### Python API + +```python +# Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache) +llm.sleep(level=1) + +# Wake up the engine (restore weights) +llm.wake_up() +``` + +#### RLHF weight updates + +During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations. + +Use `tags=["weights"]` or `tags=["kv_cache"]` to control which resources are restored, useful for RLHF and weight updates. **Note** that `is_sleeping` will report `true` until all components are awake. + +```python +# Put engine to deep sleep (level=2) +llm.sleep(level=2) +# ... Get the new weights +# Wake up only weights to avoid OOM +llm.wake_up(tags=["weights"]) +# ... Update the weights +# wake up KV cache after weights are updated +llm.wake_up(tags=["kv_cache"]) +``` + +### Online Serving + +To enable sleep mode in a vLLM server you need to initialize it with the flag `VLLM_SERVER_DEV_MODE=1` and pass `--enable-sleep-mode` to the vLLM server. + +#### Server in development mode + +When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users. + +```bash +VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-0.6B \ + --enable-sleep-mode \ + --port 8000 +``` + +#### HTTP endpoints + +- `POST /sleep?level=1` — Put the model to sleep (`level=1`). +- `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`). +- `GET /is_sleeping` — Check if the model is sleeping. + +!!! note + These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`. 
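To make the HTTP endpoints documented above concrete, here is a minimal client-side sketch (illustrative only, not taken from the patch). It assumes a server launched as in the serving example (`VLLM_SERVER_DEV_MODE=1` with `--enable-sleep-mode`) listening on `localhost:8000`, and uses the third-party `requests` package purely for convenience:

```python
# Illustrative client for the sleep-mode endpoints described above.
# Assumed: `pip install requests`, and a vLLM OpenAI-compatible server started
# with VLLM_SERVER_DEV_MODE=1 and --enable-sleep-mode on localhost:8000.
import requests

BASE = "http://localhost:8000"  # assumed server address

# Put the model to sleep (level 1: weights offloaded to CPU, KV cache discarded).
requests.post(f"{BASE}/sleep", params={"level": 1}).raise_for_status()
print(requests.get(f"{BASE}/is_sleeping").text)

# Wake up the weights first (e.g. before a weight update), then the KV cache.
requests.post(f"{BASE}/wake_up", params={"tags": "weights"}).raise_for_status()
requests.post(f"{BASE}/wake_up", params={"tags": "kv_cache"}).raise_for_status()
print(requests.get(f"{BASE}/is_sleeping").text)
```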
From 808a7b69df479b6b3a16181711cac7ca28a9b941 Mon Sep 17 00:00:00 2001 From: lkchen Date: Thu, 7 Aug 2025 23:15:50 -0700 Subject: [PATCH 085/932] [bench] Fix benchmark/serve.py to ignore unavailable results (#22382) Signed-off-by: Linkun --- vllm/benchmarks/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 6d52b51a9f..7cdf87cb4c 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -665,7 +665,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: [results[k]] - for k in metrics}, + for k in metrics if k in results}, extra_info={ k: results[k] for k in results if k not in metrics and k not in ignored_metrics From 1712543df6d0ebdc2cc9649e246ae983c92dabd3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 15:31:19 +0800 Subject: [PATCH 086/932] [CI/Build] Fix multimodal tests (#22491) Signed-off-by: DarkLight1337 --- vllm/engine/llm_engine.py | 3 ++- vllm/multimodal/registry.py | 25 +++++++++++++------------ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 79255b031e..3fc4f6445d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -845,7 +845,8 @@ class LLMEngine: def reset_mm_cache(self) -> bool: """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache() + return self.input_preprocessor.mm_registry.reset_processor_cache( + self.model_config) def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: """Reset prefix cache for all devices.""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index dca04e9a1e..565d54e1a2 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass +from functools import lru_cache from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -86,6 +87,13 @@ class _ProcessorFactories(Generic[_I]): return self.processor(info, dummy_inputs_builder, cache=cache) +# Make sure a different cache is used for each model config +# NOTE: ModelConfig is not hashable so it cannot be passed directly +@lru_cache(maxsize=1) +def _get_processor_cache(model_id: str, capacity_gb: int): + return ProcessingCache(capacity_gb) if capacity_gb > 0 else None + + class MultiModalRegistry: """ A registry that dispatches data processing according to the model. 
@@ -95,22 +103,15 @@ class MultiModalRegistry: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - self._processor_cache: Optional[ProcessingCache] = None - def _get_processor_cache(self, model_config: "ModelConfig"): + model_id = model_config.model capacity_gb = model_config.mm_processor_cache_gb - if capacity_gb is None: - return None # Overrides `disable_cache` argument + return _get_processor_cache(model_id, capacity_gb) - if self._processor_cache is None: - self._processor_cache = ProcessingCache(capacity_gb) - - return self._processor_cache - - def reset_processor_cache(self) -> bool: + def reset_processor_cache(self, model_config: "ModelConfig") -> bool: """Reset the multi-modal processing cache.""" - if self._processor_cache: - self._processor_cache.reset() + if processor_cache := self._get_processor_cache(model_config): + processor_cache.reset() return True # Success diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 45f450291a..7b4ed90fd1 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -566,7 +566,7 @@ class AsyncLLM(EngineClient): await self.engine_core.profile_async(False) async def reset_mm_cache(self) -> None: - self.processor.mm_registry.reset_processor_cache() + self.processor.mm_registry.reset_processor_cache(self.model_config) self.processor.mm_input_cache_client.reset() await self.engine_core.reset_mm_cache_async() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index efbdffbc09..5a00a93095 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -271,7 +271,7 @@ class LLMEngine: self.engine_core.profile(False) def reset_mm_cache(self): - self.processor.mm_registry.reset_processor_cache() + self.processor.mm_registry.reset_processor_cache(self.model_config) self.processor.mm_input_cache_client.reset() self.engine_core.reset_mm_cache() From 43c4f3d77c3c03f67385201e1b1725a6ba6bcc7a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Aug 2025 16:11:54 +0800 Subject: [PATCH 087/932] [Misc] Begin deprecation of `get_tensor_model_*_group` (#22494) Signed-off-by: DarkLight1337 --- tests/distributed/test_custom_all_reduce.py | 5 ++--- tests/distributed/test_quick_all_reduce.py | 5 ++--- vllm/distributed/parallel_state.py | 16 ++++++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index fae49c41d5..9212c04dee 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -10,8 +10,7 @@ import torch.distributed as dist from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) +from vllm.distributed.parallel_state import get_tp_group, graph_capture from ..utils import (ensure_model_parallel_initialized, init_test_distributed_environment, multi_process_parallel) @@ -37,7 +36,7 @@ def graph_allreduce( init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group + group = get_tp_group().device_group # A small all_reduce for warmup. 
# this is needed because device communicators might be created lazily diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index a4added291..6245ccbeca 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -10,8 +10,7 @@ import torch.distributed as dist from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import (get_tensor_model_parallel_group, - get_tp_group, graph_capture) +from vllm.distributed.parallel_state import get_tp_group, graph_capture from vllm.platforms import current_platform from ..utils import (ensure_model_parallel_initialized, @@ -42,7 +41,7 @@ def graph_quickreduce( init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group + group = get_tp_group().device_group # A small all_reduce for warmup. # this is needed because device communicators might be created lazily diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6c25cdcfb7..0b3993ca02 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -36,6 +36,7 @@ from unittest.mock import patch import torch import torch.distributed from torch.distributed import Backend, ProcessGroup +from typing_extensions import deprecated import vllm.envs as envs from vllm.distributed.device_communicators.base_device_communicator import ( @@ -894,8 +895,12 @@ def get_tp_group() -> GroupCoordinator: return _TP -# kept for backward compatibility -get_tensor_model_parallel_group = get_tp_group +@deprecated("`get_tensor_model_parallel_group` has been replaced with " + "`get_tp_group` and may be removed after v0.12. Please use " + "`get_tp_group` instead.") +def get_tensor_model_parallel_group(): + return get_tp_group() + _PP: Optional[GroupCoordinator] = None @@ -921,8 +926,11 @@ def get_pp_group() -> GroupCoordinator: return _PP -# kept for backward compatibility -get_pipeline_model_parallel_group = get_pp_group +@deprecated("`get_pipeline_model_parallel_group` has been replaced with " + "`get_pp_group` and may be removed in v0.12. Please use " + "`get_pp_group` instead.") +def get_pipeline_model_parallel_group(): + return get_pp_group() @contextmanager From 904063907c141fe59c2302afe5bc94cbb53c0de6 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Fri, 8 Aug 2025 16:12:54 +0800 Subject: [PATCH 088/932] [Misc] fix openai version (#22485) Signed-off-by: rongfu.leng --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 5c422500e1..1a8fea0dd7 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
aiohttp -openai >= 1.98.0 # For Responses API with reasoning content +openai >= 1.99.1 # For Responses API with reasoning content pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From ccdae737a0c947467488c05f61537e5658fe5064 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 8 Aug 2025 01:13:18 -0700 Subject: [PATCH 089/932] [BugFix] Don't cancel asyncio tasks directly from destructors (#22476) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 23 +++++++++++++++++------ vllm/v1/engine/async_llm.py | 5 ++--- vllm/v1/engine/core_client.py | 9 ++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index ce62282c21..6d82714f3c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -687,19 +687,30 @@ class AsyncMicrobatchTokenizer: max_length = kwargs.get("max_length") if not truncation: - return ("encode", add_special_tokens, False, None) + return "encode", add_special_tokens, False, None model_max = getattr(self.tokenizer, "model_max_length", None) if max_length is None or (model_max is not None and max_length == model_max): - return ("encode", add_special_tokens, True, "model_max") + return "encode", add_special_tokens, True, "model_max" - return ("encode", "other") + return "encode", "other" def __del__(self): - for task in self._batcher_tasks: - if not task.done(): - task.cancel() + if ((tasks := getattr(self, "_batcher_tasks", None)) + and (loop := getattr(self, "_loop", None)) + and not loop.is_closed()): + + def cancel_tasks(): + for task in tasks: + task.cancel() + + loop.call_soon_threadsafe(cancel_tasks) + + +def cancel_task_threadsafe(task: Task): + if task and not task.done() and not (loop := task.get_loop()).is_closed(): + loop.call_soon_threadsafe(task.cancel) def make_async( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7b4ed90fd1..a270632791 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv, deprecate_kwargs +from vllm.utils import Device, cancel_task_threadsafe, cdiv, deprecate_kwargs from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -219,8 +219,7 @@ class AsyncLLM(EngineClient): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() + cancel_task_threadsafe(getattr(self, "output_handler", None)) async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return await self.engine_core.get_supported_tasks_async() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4d30bb6b74..05b4d72608 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -23,7 +23,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask -from vllm.utils import get_open_port, get_open_zmq_inproc_path, make_zmq_socket +from vllm.utils import (cancel_task_threadsafe, get_open_port, + get_open_zmq_inproc_path, make_zmq_socket) from vllm.v1.engine import (EngineCoreOutputs, 
EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, @@ -342,10 +343,8 @@ class BackgroundResources: if self.coordinator is not None: self.coordinator.close() - if self.output_queue_task is not None: - self.output_queue_task.cancel() - if self.stats_update_task is not None: - self.stats_update_task.cancel() + cancel_task_threadsafe(self.output_queue_task) + cancel_task_threadsafe(self.stats_update_task) # ZMQ context termination can hang if the sockets # aren't explicitly closed first. From 7be7f3824a2d610299991ceefb1b034b3a923b0f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:02:51 +0100 Subject: [PATCH 090/932] [Docs] Improve API docs (+small tweaks) (#22459) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 5 ++--- docs/api/{README.md => summary.md} | 0 docs/features/quantization/inc.md | 5 +---- docs/mkdocs/hooks/generate_examples.py | 2 +- mkdocs.yaml | 6 +----- 5 files changed, 5 insertions(+), 13 deletions(-) rename docs/api/{README.md => summary.md} (100%) diff --git a/docs/.nav.yml b/docs/.nav.yml index ad742be3d6..77342e2674 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -58,10 +58,9 @@ nav: - CI: contributing/ci - Design Documents: design - API Reference: - - Summary: api/README.md + - Summary: api/summary.md - Contents: - - glob: api/vllm/* - preserve_directory_names: true + - api/vllm/* - CLI Reference: - Summary: cli/README.md - Community: diff --git a/docs/api/README.md b/docs/api/summary.md similarity index 100% rename from docs/api/README.md rename to docs/api/summary.md diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md index d97a462f54..13b151bc7f 100644 --- a/docs/features/quantization/inc.md +++ b/docs/features/quantization/inc.md @@ -1,7 +1,4 @@ ---- -title: FP8 INC ---- -[](){ #inc } +# FP8 INC vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators. Currently, quantization is validated only in Llama models. 
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 0ee52bb346..6b4c5b3107 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -105,7 +105,7 @@ class Example: return fix_case(self.path.stem.replace("_", " ").title()) def generate(self) -> str: - content = f"---\ntitle: {self.title}\n---\n\n" + content = f"# {self.title}\n\n" content += f"Source .\n\n" # Use long code fence to avoid issues with diff --git a/mkdocs.yaml b/mkdocs.yaml index e5b7454003..3a64888fb4 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -40,6 +40,7 @@ theme: - navigation.sections - navigation.prune - navigation.top + - navigation.indexes - search.highlight - search.share - toc.follow @@ -51,11 +52,6 @@ hooks: - docs/mkdocs/hooks/generate_argparse.py - docs/mkdocs/hooks/url_schemes.py -# Required to stop api-autonav from raising an error -# https://github.com/tlambert03/mkdocs-api-autonav/issues/16 -nav: - - api - plugins: - meta - search From e5ebeeba531755a78f68413e88a23d061404f3e3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:06:46 +0100 Subject: [PATCH 091/932] Remove exception for Python 3.8 typing from linter (#22506) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- pyproject.toml | 2 -- vllm/utils/__init__.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfad5d2cdf..03a32ac0ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,8 +73,6 @@ line-length = 80 "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] -# Python 3.8 typing - skip utils for ROCm -"vllm/utils/__init__.py" = ["UP006", "UP035"] [tool.ruff.lint] select = [ diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 6d82714f3c..e39cdf76dc 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -47,7 +47,7 @@ from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, TextIO, Tuple, TypeVar, Union, cast, overload) + Optional, TextIO, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -861,7 +861,7 @@ def is_valid_ipv6_address(address: str) -> bool: return False -def split_host_port(host_port: str) -> Tuple[str, int]: +def split_host_port(host_port: str) -> tuple[str, int]: # ipv6 if host_port.startswith('['): host, port = host_port.rsplit(']', 1) From e789cad6b8b5d2a01aa6521b9208bb8d6501ee5b Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Fri, 8 Aug 2025 08:24:07 -0700 Subject: [PATCH 092/932] [gpt-oss] triton kernel mxfp4 (#22421) Signed-off-by: Signed-off-by: Yongye Zhu --- .gitignore | 3 + .../moe/test_gpt_oss_triton_kernels.py | 375 ++++++++++++++++++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 230 +++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 17 +- .../layers/quantization/mxfp4.py | 66 ++- .../layers/quantization/utils/mxfp4_utils.py | 46 ++- vllm/model_executor/layers/utils.py | 21 + vllm/utils/__init__.py | 6 + 8 files changed, 755 insertions(+), 9 deletions(-) create mode 100644 tests/kernels/moe/test_gpt_oss_triton_kernels.py create mode 100644 vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py diff --git a/.gitignore b/.gitignore index 96b97a552c..5dc0f04b6f 
100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# triton jit +.triton + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py new file mode 100644 index 0000000000..3f9b32ce5a --- /dev/null +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass, fields + +import pytest +import torch +import torch.nn.functional as F +import triton_kernels.swiglu +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import (downcast_to_mxfp, + upcast_from_mxfp) +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.testing import assert_close + +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedPrepareAndFinalize) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + BatchedOAITritonExperts, triton_kernel_moe_forward) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.utils import shuffle_weight +from vllm.utils import round_up + + +def deshuffle(w: torch.Tensor): + first = w[..., ::2] + second = w[..., 1::2] + + deshuffled = torch.concat((first, second), dim=-1) + return deshuffled + + +def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): + randbits = [torch.randperm(E) for _ in range(M)] + x_list = [ + (-1)**i * + ((16384 + + ((i * 512) % 4096) + bits).to(torch.int16).view(torch.bfloat16)) + for i, bits in enumerate(randbits) + ] + exp_data = torch.stack(x_list).to( + device="cuda") # simulating gate_output (M, E) + + # create input tensor + x = torch.randn((M, K), dtype=torch.bfloat16, device="cuda") + w1 = torch.randn((E, 2 * N, K), dtype=torch.bfloat16, device="cuda") + w1_bias = torch.randn((E, 2 * N), dtype=torch.bfloat16, device="cuda") + + w2 = torch.randn((E, K, N), dtype=torch.bfloat16, device="cuda") + w2_bias = torch.randn((E, K), dtype=torch.bfloat16, device="cuda") + + exp_data_tri = exp_data.clone() + x_tri = x.clone() + w1_tri = w1.clone() + w2_tri = w2.clone() + + w1_bias_tri = w1_bias.clone() + w2_bias_tri = w2_bias.clone() + w1_bias_tri = w1_bias_tri.to(torch.float32) + w2_bias_tri = w2_bias_tri.to(torch.float32) + + dtype_dict = { + "bf16": torch.bfloat16, + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2 + } + + x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16) + if w_dtype != "mx4": + # simulate quantization support on reference impl + w1 = w1.to(dtype_dict[w_dtype]).to(torch.bfloat16) + w2 = w2.to(dtype_dict[w_dtype]).to(torch.bfloat16) + + # triton moe kernel use transposed shape for matmul + w1_tri = w1_tri.transpose(-2, -1) + w2_tri = w2_tri.transpose(-2, -1) + + # shuffle weights + w1_tri = shuffle_weight(w1_tri) + w1_bias_tri = shuffle_weight(w1_bias_tri) + + # quant triton_weights + x_tri = x.to(dtype_dict[a_dtype]) + if w_dtype != "mx4": + pytest.skip("NYI") + else: # quantize to mx4 + # careful on the padding here, the activation padding need to be + # multiple of 64, the actual engine is not implemented + 
w1_bottom_pad = round_up(w1_tri.shape[1], 64) - w1_tri.shape[1] + w1_right_pad = round_up(w1_tri.shape[2], 128) - w1_tri.shape[2] + + w2_bottom_pad = w1_right_pad // 2 + w2_right_pad = w1_bottom_pad + + x_pad = w1_bottom_pad + + w1_tri = F.pad(w1_tri, (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), + mode="constant", + value=0) + w2_tri = F.pad(w2_tri, (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), + mode="constant", + value=0) + + w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0), + mode="constant", + value=0) + w2_bias_tri = F.pad(w2_bias_tri, (0, w2_right_pad, 0, 0), + mode="constant", + value=0) + + x_tri = F.pad(x_tri, (0, x_pad, 0, 0), mode="constant", value=0) + + w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout( + mx_axis=1) + w_scale_layout, w_scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout( + mx_axis=1, num_warps=num_warps)) + + w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1) + w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, torch.bfloat16, axis=1) + + w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1) + w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, torch.bfloat16, axis=1) + + w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, + **w_layout_opts) + w1_scale_tri = convert_layout(wrap_torch_tensor(w1_scale_tri), + w_scale_layout, **w_scale_layout_opts) + + w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, + **w_layout_opts) + w2_scale_tri = convert_layout(wrap_torch_tensor(w2_scale_tri), + w_scale_layout, **w_scale_layout_opts) + + pc1 = PrecisionConfig(weight_scale=w1_scale_tri, + flex_ctx=FlexCtx(rhs_data=InFlexData())) + pc2 = PrecisionConfig(weight_scale=w2_scale_tri, + flex_ctx=FlexCtx(rhs_data=InFlexData())) + + # tucuate so the rest can run properly + w1 = w1[..., :K, :2 * N] + w2 = w2[..., :N, :K] + + w1 = deshuffle(w1) + + w1 = w1.transpose(-1, -2).contiguous() + w2 = w2.transpose(-1, -2).contiguous() + + return (x, w1, w1_bias, w2, w2_bias, exp_data, x_tri, w1_tri, w2_tri, + exp_data_tri, w1_bias_tri, w2_bias_tri, pc1, pc2) + + +@dataclass +class ModelConfig: + num_hidden_layers: int = 36 + num_experts: int = 128 + experts_per_token: int = 4 + vocab_size: int = 201088 + hidden_size: int = 2880 + intermediate_size: int = 2880 + head_dim: int = 64 + num_attention_heads: int = 64 + num_key_value_heads: int = 8 + sliding_window: int = 128 + initial_context_length: int = 4096 + rope_theta: float = 150000.0 + rope_scaling_factor: float = 32.0 + rope_ntk_alpha: float = 1.0 + rope_ntk_beta: float = 32.0 + + +def swiglu(x, alpha: float = 1.702, limit: float = 1.0): + # Note we add an extra bias of 1 to the linear layer + x_glu, x_linear = torch.chunk(x, 2, dim=-1) + if limit is not None: + x_glu = x_glu.clamp(max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + if limit is not None: + x_linear = x_linear.clamp(min=-limit, max=limit) + return out_glu * (x_linear + 1) + + +def oai_moe_forward( + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, 2N) + w1_bias: torch.Tensor, # (E, 2N, K) + w2: torch.Tensor, # (E, K, N) + w2_bias: torch.Tensor, # (E, N) + gating_output: torch.Tensor, # (M, E) + topk: int): + # model.py 309:330, assuming gating and norm + t = hidden_states + experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True) + expert_weights = torch.nn.functional.softmax(experts.values, dim=1) + expert_indices = experts.indices + + # MLP #1 + mlp1_weight = w1[expert_indices, ...] + mlp1_bias = w1_bias[expert_indices, ...] 
+ t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias + t = swiglu(t, limit=7) + + # MLP #2 + mlp2_weight = w2[expert_indices, ...] + mlp2_bias = w2_bias[expert_indices, ...] + t = torch.einsum("beck,bek->bec", mlp2_weight, t) + t += mlp2_bias + + # Weighted sum of experts + t = torch.einsum("bec,be->bc", t, expert_weights) + + return t + + +@dataclass +class Case: + a_dtype: str + w_dtype: str + + +@pytest.mark.parametrize( + ", ".join(f.name for f in fields(Case)), + [ + tuple(getattr(case, f.name) for f in fields(Case)) for case in [ + # Case(a_dtype="bf16", w_dtype="bf16"), + # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"), + Case(a_dtype="bf16", w_dtype="mx4") + ] + ], +) +@pytest.mark.parametrize("num_token", [2]) +@pytest.mark.parametrize("tp", [1, 2, 4, 8]) +def test_equiv(num_token, a_dtype, w_dtype, tp): + M = num_token + E = ModelConfig.num_experts + K = ModelConfig.hidden_size + N = ModelConfig.intermediate_size // tp + topk = ModelConfig.experts_per_token + + x, w1, w1_bias, w2, w2_bias, exp_data, \ + x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri,\ + w2_bias_tri, pc1, pc2 = init_compute_data( + M, K, N, E, a_dtype, w_dtype, num_warps=8) + + out_triton_monolithic = triton_kernel_moe_forward( + hidden_states=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + w1_precision=pc1, + w2_precision=pc2) + out_triton_monolithic = out_triton_monolithic[..., :K] + + out_ref = oai_moe_forward(hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk) + assert_close(ref=out_ref, + tri=out_triton_monolithic, + maxtol=0.025, + rmstol=0.005) + + +def batched_moe(a: torch.Tensor, w1, w2, gating_output: torch.Tensor, + topk: int, renormalize: bool, w1_bias: torch.Tensor, + w2_bias: torch.Tensor, w1_precision: PrecisionConfig, + w2_precision: PrecisionConfig) -> torch.Tensor: + max_num_tokens = round_up(a.shape[0], 64) + + fused_experts = FusedMoEModularKernel( + BatchedPrepareAndFinalize(max_num_tokens, + num_dispatchers=1, + num_local_experts=w1.shape[0], + rank=0), + BatchedOAITritonExperts( + None, + max_num_tokens=max_num_tokens, + num_dispatchers=1, + w1_precision=w1_precision, + w2_precision=w2_precision, + ), + ) + + extra_expert_args = { + "w1_bias": w1_bias, + "w2_bias": w2_bias, + } + + topk_weight, topk_ids, _ = fused_topk(a, gating_output, topk, renormalize) + + return fused_experts( + a, + w1, + w2, + topk_weight, + topk_ids, + extra_expert_args=extra_expert_args, + ) + + +@pytest.mark.parametrize( + ", ".join(f.name for f in fields(Case)), + [ + tuple(getattr(case, f.name) for f in fields(Case)) for case in [ + # Case(a_dtype="bf16", w_dtype="bf16"), + # Case(a_dtype="fp8_e4m3", w_dtype="fp8_e5m2"), + Case(a_dtype="bf16", w_dtype="mx4") + ] + ], +) +@pytest.mark.parametrize("num_token", [64]) +@pytest.mark.parametrize("ep", [1, 2, 4, 8]) +def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep): + M = num_token + E = ModelConfig.num_experts // ep + K = ModelConfig.hidden_size + N = ModelConfig.intermediate_size + topk = ModelConfig.experts_per_token + + x, w1, w1_bias, w2, w2_bias, exp_data, \ + x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri, \ + w2_bias_tri, pc1, pc2 = init_compute_data( + M, K, N, E, a_dtype, w_dtype, num_warps=4) + + out_tri = batched_moe(a=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + 
w1_precision=pc1, + w2_precision=pc2) + out_tri = out_tri[..., :K] + + out_ref = oai_moe_forward(hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk) + assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005) + + +def test_unit_shuffle(): + N = ModelConfig.intermediate_size + K = ModelConfig.hidden_size + m = torch.randn((K, 2 * N), dtype=torch.bfloat16, device="cuda") + + x = torch.randn(K, dtype=torch.bfloat16, device="cuda") + + m_shuffled = shuffle_weight(m) + + out_ref = x @ m + out_ref = swiglu(out_ref, limit=1.0) + + out = x @ m_shuffled + out = triton_kernels.swiglu.swiglu_torch( + out, + alpha=1.702, + precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0)) + + assert_close(ref=out_ref, tri=out) \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py new file mode 100644 index 0000000000..4482029c16 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate) +from vllm.model_executor.layers.fused_moe.utils import extract_required_args + +if True: + import triton_kernels.swiglu + from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, + PrecisionConfig, matmul_ogs) + from triton_kernels.routing import routing + + +def triton_kernel_moe_forward( + hidden_states: torch.Tensor, + w1, # Tensor or triton_kernels.Tensor + w2, # Tensor or triton_kernels.Tensor + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + w1_precision=None, # PrecisionConfig or None + w2_precision=None, # PrecisionConfig or None + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + + routing_data, gather_idx, scatter_idx = routing(gating_output, + topk, + sm_first=not renormalize) + + return triton_kernel_fused_experts( + None, + hidden_states, + w1, + w2, + routing_data, + gather_idx, + scatter_idx, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=use_fp8_w8a8, + per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_precision=w1_precision, + w2_precision=w2_precision, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) + + +# This is a triton implementation of the fused_experts function +def triton_kernel_fused_experts( + output_tensor: torch.Tensor, + hidden_states: torch.Tensor, + w1, # Tensor or triton_kernels.Tensor + w2, # Tensor or triton_kernels.Tensor + routing_data, # RoutingData + gather_indx, # GatherIndx + scatter_indx, # ScatterIndx + 
activation: str = "silu", + swiglu_alpha: float = 1.702, + swiglu_limit: float = 7.0, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, + w1_precision=None, # PrecisionConfig or None + w2_precision=None, # PrecisionConfig or None + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + + # type check, uint8 means mxfp4 + assert hidden_states.dtype == torch.bfloat16 + assert w1_bias is None or w1_bias.dtype == torch.float32 + assert w2_bias is None or w2_bias.dtype == torch.float32 + + # Shape check, only check non-mxfp4 + assert hidden_states.shape[-1] == w1.shape[-2] + assert w2.shape[-1] == w1.shape[1] + + E, _, N = w1.shape + + if global_num_experts == -1: + global_num_experts = E + + act = FusedActivation( + FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), + (swiglu_alpha, swiglu_limit), 2) + gammas = routing_data.gate_scal if routing_data else None + + intermediate_cache1 = matmul_ogs( + hidden_states, + w1, + w1_bias, + routing_data, + gather_indx=gather_indx, + precision_config=w1_precision, + gammas=gammas if apply_router_weight_on_input else None, + fused_activation=act) + + intermediate_cache3 = matmul_ogs( + intermediate_cache1, + w2, + w2_bias, + routing_data, + scatter_indx=scatter_indx, + precision_config=w2_precision, + gammas=None if apply_router_weight_on_input else gammas, + y=output_tensor, + ) + return intermediate_cache3 + + +class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, quant_config, max_num_tokens: int, num_dispatchers: int, + w1_precision: PrecisionConfig, w2_precision: PrecisionConfig): + super().__init__(quant_config) + self.max_num_tokens = max_num_tokens + self.num_dispatchers = num_dispatchers + self.w1_precision = w1_precision + self.w2_precision = w2_precision + + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, a: torch.Tensor, aq: torch.Tensor, M: int, N: int, K: int, + topk: int, global_num_experts: int, local_num_experts: int, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata] + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + # workspace are allocated inside the kernel + assert a.dim() == 2 + num_dp = self.num_dispatchers + num_experts = local_num_experts + max_num_tokens = self.max_num_tokens + workspace2 = (0, 0, 0) + output = (num_experts, max_num_tokens * num_dp, N) + return (output, workspace2, output, a.dtype) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + extra_expert_args: Optional[dict[str, Any]], + ): + w1_bias, w2_bias = (extract_required_args(extra_expert_args, + ["w1_bias", "w2_bias"])) + + return triton_kernel_fused_experts( + output, + hidden_states, + w1, + w2, + None, + None, + None, + activation=activation, + apply_router_weight_on_input=False, + use_fp8_w8a8=False, + per_channel_quant=False, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_precision=self.w1_precision, + w2_precision=self.w2_precision, + a1_scale=a1q_scale, + a2_scale=a2_scale) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 272b6ce672..d664a92841 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,7 +36,7 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, - round_up) + has_triton_kernels, is_torch_equal_or_newer, round_up) from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): @@ -723,10 +723,17 @@ class FusedMoE(torch.nn.Module): self.global_num_experts = num_experts + num_redundant_experts # we padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4" and ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): - hidden_size = round_up(hidden_size, 256) + if quant_config and quant_config.get_name() == "mxfp4": + if not is_torch_equal_or_newer("2.8.0"): + raise RuntimeError("Mxfp4 on hopper requires torch >= 2.8.0") + if current_platform.is_device_capability( + 90) and not has_triton_kernels(): + raise NotImplementedError( + "Triton kernels must be installed for mxfp4 on hopper") + if (current_platform.is_rocm() + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): + hidden_size = round_up(hidden_size, 256) # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config diff --git a/vllm/model_executor/layers/quantization/mxfp4.py 
b/vllm/model_executor/layers/quantization/mxfp4.py index 068af02739..4e59aef480 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -8,16 +8,19 @@ from torch.nn.parameter import Parameter from vllm import envs from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + triton_kernel_moe_forward) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( - _can_support_mxfp4) + _can_support_mxfp4, _swizzle_mxfp4) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.utils import next_power_of_2, round_up if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 @@ -39,7 +42,7 @@ class Mxfp4Config(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 100 + return 90 @classmethod def get_name(cls) -> QuantizationMethods: @@ -100,11 +103,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): intermediate_size_per_partition # pad the intermediate size to be a multiple of 2 * mxfp4_block # for to hold non-uniform sharded tensor as well as swizzling + # other padding to increase performance if (envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): intermediate_size_per_partition_after_pad = round_up( intermediate_size_per_partition, 256) hidden_size = round_up(hidden_size, 256) + elif current_platform.is_rocm(): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128) + else: + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 64) self.intermediate_size = intermediate_size_per_partition_after_pad self.hidden_size = hidden_size @@ -303,7 +313,41 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape( self.num_experts, -1), requires_grad=False) - return + else: + from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig + + w13_bias = layer.w13_bias.to(torch.float32) + w2_bias = layer.w2_bias.to(torch.float32) + + layer.w13_bias = Parameter(w13_bias, requires_grad=False) + layer.w2_bias = Parameter(w2_bias, requires_grad=False) + + # FIXME warp need to be adjusted based on batch size + # only apply to batched mode + if self.moe.use_ep: + num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 + else: + num_warps = 8 + + w13_weight, w13_flex, w13_scale = _swizzle_mxfp4( + layer.w13_weight, layer.w13_weight_scale, num_warps) + w2_weight, w2_flex, w2_scale = _swizzle_mxfp4( + layer.w2_weight, layer.w2_weight_scale, num_warps) + + self.w13_precision_config = PrecisionConfig( + weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)) + self.w2_precision_config = PrecisionConfig( + weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)) + + self.w13_weight_triton_tensor = w13_weight + self.w2_weight_triton_tensor = w2_weight + + # need to delete the original weights to save memory on single GPU + del layer.w13_weight + del layer.w2_weight + layer.w13_weight = None + layer.w2_weight = None + torch.cuda.empty_cache() def 
_get_tile_tokens_dim(self, x: torch.Tensor, top_k: int): # Number of tokens in the input tensor. @@ -404,3 +448,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): True, # do finalize )[0] return trtllm_gen_output + else: + return triton_kernel_moe_forward( + hidden_states=x, + w1=self.w13_weight_triton_tensor, + w2=self.w2_weight_triton_tensor, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_precision=self.w13_precision_config, + w2_precision=self.w2_precision_config, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 4a4e199e13..4084dd837c 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -4,11 +4,55 @@ from typing import Callable, Optional import torch -from vllm.utils import direct_register_custom_op +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer + +logger = init_logger(__name__) OCP_MX_BLOCK_SIZE = 32 +def _swizzle_mxfp4(quant_tensor, scale, num_warps): + """ weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel + """ + import triton_kernels.matmul_ogs_details.opt_flags as opt_flags + from triton_kernels.numerics import InFlexData + from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor + from triton_kernels.tensor_details import layout + from triton_kernels.tensor_details.layout import StridedLayout + if (current_platform.is_cuda() + and current_platform.is_device_capability(90) + and not is_torch_equal_or_newer("2.8.1")): + logger.warning_once( + "Mxfp4 on hopper is running on torch < 2.8.1, " + "this cause swizling to be disabled, which may " + "cause performance degradation. 
Please upgrade to torch nightly") + value_layout, value_layout_opts = StridedLayout, dict() + scale_layout, scale_layout_opts = StridedLayout, dict() + else: + value_layout, value_layout_opts = \ + layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + scale_layout, scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout( + mx_axis=1, num_warps=num_warps)) + if current_platform.is_cuda() and \ + current_platform.is_device_capability(100): + constraints = { + "is_persistent": True, + "epilogue_subtile": 1, + } + opt_flags.update_opt_flags_constraints(constraints) + # transpose the tensor so that the quantization axis is on dim1 + quant_tensor = quant_tensor.transpose(-2, -1) + scale = scale.transpose(-2, -1) + quant_tensor = convert_layout(wrap_torch_tensor(quant_tensor, dtype=FP4), + value_layout, **value_layout_opts) + scale = convert_layout(wrap_torch_tensor(scale), scale_layout, + **scale_layout_opts) + return quant_tensor, InFlexData(), scale + + def _can_support_mxfp4(use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index cd32f12f3c..48a347a8f5 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -11,6 +11,27 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +def shuffle_weight(w: torch.Tensor) -> torch.Tensor: + # Shuffle weight along the last dimension so that + # we folded the weights to adjance location + # Example: + # input: + # [[1, 2, 3, 4, 5, 6], + # [7, 8, 9, 10, 11, 12]] + # output: + # [[1, 4, 2, 5, 3, 6], + # [7, 10, 8, 11, 9, 12]] + # This will be used together with triton swiglu kernel + shape = w.shape + N = shape[-1] + first = w[..., :N // 2] + second = w[..., N // 2:] + + stacked = torch.stack((first, second), dim=-1) + w_shuffled = stacked.reshape(shape) + return w_shuffled + + def get_token_bin_counts_and_mask( tokens: torch.Tensor, vocab_size: int, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index e39cdf76dc..7a0abf5b59 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3254,6 +3254,12 @@ def has_deep_gemm() -> bool: return _has_module("deep_gemm") +def has_triton_kernels() -> bool: + """Whether the optional `triton_kernels` package is available.""" + + return _has_module("triton_kernels") + + def set_process_title(name: str, suffix: str = "", append: bool = False) -> None: From f0964e29cb3b2deccdad89f5f8c068d3a629d239 Mon Sep 17 00:00:00 2001 From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:28:50 +0300 Subject: [PATCH 093/932] [Benchmark] Add benchmark tool for multi turn conversations (#20267) --- benchmarks/multi_turn/README.md | 71 + benchmarks/multi_turn/bench_dataset.py | 493 ++++++ benchmarks/multi_turn/bench_utils.py | 25 + .../benchmark_serving_multi_turn.py | 1557 +++++++++++++++++ .../multi_turn/convert_sharegpt_to_openai.py | 354 ++++ .../multi_turn/generate_multi_turn.json | 35 + benchmarks/multi_turn/requirements.txt | 5 + 7 files changed, 2540 insertions(+) create mode 100644 benchmarks/multi_turn/README.md create mode 100644 benchmarks/multi_turn/bench_dataset.py create mode 100644 benchmarks/multi_turn/bench_utils.py create mode 100644 benchmarks/multi_turn/benchmark_serving_multi_turn.py create mode 100644 benchmarks/multi_turn/convert_sharegpt_to_openai.py create mode 100644 
benchmarks/multi_turn/generate_multi_turn.json create mode 100644 benchmarks/multi_turn/requirements.txt diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md new file mode 100644 index 0000000000..ae0866ae60 --- /dev/null +++ b/benchmarks/multi_turn/README.md @@ -0,0 +1,71 @@ +# Benchmark KV Cache Offloading with Multi-Turn Conversations + +The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt` + +First start serving your model + +```bash +export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +vllm serve $MODEL_NAME --disable-log-requests +``` + +## Synthetic Multi-Turn Conversations + +Download the following text file (used for generation of synthetic conversations) + +```bash +wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 +mv 1184.txt.utf-8 pg1184.txt +``` + +The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`). + +But you may use other text files if you prefer (using this specific file is not required). + +Then run the benchmarking script + +```bash +export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \ +--num-clients 2 --max-active-conversations 6 +``` + +You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.). + +If successful, you will see the following output + +```bash +---------------------------------------------------------------------------------------------------- +Statistics summary: +runtime_sec = 215.810 +requests_per_sec = 0.769 +---------------------------------------------------------------------------------------------------- + count mean std min 25% 50% 75% 90% 99% max +ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54 +tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05 +latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94 +input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00 +input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00 +output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00 +output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00 +---------------------------------------------------------------------------------------------------- +``` + +## ShareGPT Conversations + +To run with the ShareGPT data, download the following ShareGPT dataset: +`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json` + +Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py` + +```bash +python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128 +``` + +The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles. + +The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed). + +Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`. 
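The generation config passed via `--input-file` (e.g. `generate_multi_turn.json`) follows the schema read by `parse_input_json_file` in `bench_dataset.py`. Below is a minimal sketch with illustrative values only; the bundled `generate_multi_turn.json` may differ, and each `distribution` block may instead use `constant`, `zipf`, `poisson`, or `lognormal` with their respective parameters (`value`, `alpha`, `mean`/`sigma`, optional `max`).

```json
{
    "filetype": "generate_conversations",
    "num_conversations": 24,
    "text_files": ["pg1184.txt"],
    "print_stats": false,
    "prompt_input": {
        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
        "common_prefix_num_tokens": {"distribution": "constant", "value": 500},
        "prefix_num_tokens": {"distribution": "uniform", "min": 256, "max": 1024},
        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
    },
    "prompt_output": {
        "num_tokens": {"distribution": "uniform", "min": 80, "max": 120}
    }
}
```

Note that `num_turns` is rounded up to an even value at generation time so every user prompt gets an assistant answer, and `common_prefix_num_tokens` is optional (it defaults to a constant 0 when omitted).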
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py new file mode 100644 index 0000000000..411b89dd23 --- /dev/null +++ b/benchmarks/multi_turn/bench_dataset.py @@ -0,0 +1,493 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from statistics import mean +from typing import Any, NamedTuple, Optional, Union + +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_utils import ( + TEXT_SEPARATOR, + Color, + logger, +) +from transformers import AutoTokenizer # type: ignore + +# Conversation ID is a string (e.g: "UzTK34D") +ConvId = str + +# A list of dicts (dicts with keys "id" and "messages") +ShareGptConversations = list[dict[str, Any]] + +# A list of dicts (dicts with keys "role" and "content") +MessagesList = list[dict[str, str]] + +# Map conversation ID to conversation messages +ConversationsMap = list[ConvId, MessagesList] + + +class Distribution(ABC): + @abstractmethod + def sample(self, size: int = 1) -> np.ndarray: + pass + + +class UniformDistribution(Distribution): + def __init__( + self, + min_val: Union[int, float], + max_val: Union[int, float], + is_integer: bool = True, + ) -> None: + self.min_val = min_val + self.max_val = max_val + self.is_integer = is_integer + + def sample(self, size: int = 1) -> np.ndarray: + if self.is_integer: + return np.random.randint( + int(self.min_val), int(self.max_val + 1), size=size + ) + else: + return np.random.uniform(self.min_val, self.max_val, size=size) + + def __repr__(self) -> str: + return f"UniformDistribution[{self.min_val}, {self.max_val}]" + + +class ConstantDistribution(Distribution): + def __init__(self, value: Union[int, float]) -> None: + self.value = value + self.max_val = value + + def sample(self, size: int = 1) -> np.ndarray: + return np.full(shape=size, fill_value=self.value) + + def __repr__(self) -> str: + return f"Constant[{self.value}]" + + +class ZipfDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.zipf(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"ZipfDistribution[{self.alpha}]" + + +class PoissonDistribution(Distribution): + def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.poisson(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"PoissonDistribution[{self.alpha}]" + + +class LognormalDistribution(Distribution): + def __init__( + self, mean: float, sigma: float, max_val: Optional[int] = None + ) -> None: + self.mean = mean + self.sigma = sigma + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + + return np.round(samples).astype(int) + + def __repr__(self) -> str: + return f"LognormalDistribution[{self.mean}, {self.sigma}]" + + +class GenConvArgs(NamedTuple): + num_conversations: int + text_files: list[str] + input_num_turns: Distribution + input_common_prefix_num_tokens: Distribution + 
input_prefix_num_tokens: Distribution + input_num_tokens: Distribution + output_num_tokens: Distribution + print_stats: bool + + +def verify_field_exists( + conf: dict, field_name: str, section: str, subsection: str +) -> None: + if field_name not in conf: + raise ValueError( + f"Missing field '{field_name}' in {section=} and {subsection=}" + ) + + +def get_random_distribution( + conf: dict, section: str, subsection: str, optional: bool = False +) -> Distribution: + # section can be "prompt_input" or "prompt_output" (both required) + conf = conf[section] + + if optional and subsection not in conf: + # Optional subsection, if not found assume the value is always 0 + return ConstantDistribution(0) + + # subsection can be "num_turns", "num_tokens" or "prefix_num_tokens" + if subsection not in conf: + raise ValueError(f"Missing subsection {subsection} in section {section}") + + conf = conf[subsection] + + distribution = conf.get("distribution") + if distribution is None: + raise ValueError( + f"Missing field 'distribution' in {section=} and {subsection=}" + ) + + if distribution == "constant": + verify_field_exists(conf, "value", section, subsection) + return ConstantDistribution(conf["value"]) + + elif distribution == "zipf": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return ZipfDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "poisson": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return PoissonDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "lognormal": + verify_field_exists(conf, "mean", section, subsection) + verify_field_exists(conf, "sigma", section, subsection) + max_val = conf.get("max", None) + return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val) + + elif distribution == "uniform": + verify_field_exists(conf, "min", section, subsection) + verify_field_exists(conf, "max", section, subsection) + + min_value = conf["min"] + max_value = conf["max"] + + assert min_value > 0 + assert min_value <= max_value + + is_integer = isinstance(min_value, int) and isinstance(max_value, int) + return UniformDistribution(min_value, max_value, is_integer) + else: + raise ValueError(f"Unknown distribution: {distribution}") + + +def parse_input_json_file(conf: dict) -> GenConvArgs: + # Validate the input file + assert isinstance(conf, dict) + required_fields = [ + "filetype", + "num_conversations", + "text_files", + "prompt_input", + "prompt_output", + ] + for field in required_fields: + assert field in conf, f"Missing field {field} in input {conf}" + + assert conf["filetype"] == "generate_conversations" + + assert conf["num_conversations"] > 0, "num_conversations should be larger than zero" + + text_files = conf["text_files"] + + assert isinstance(text_files, list), "Field 'text_files' should be a list" + assert len(text_files) > 0, ( + "Field 'text_files' should be a list with at least one file" + ) + + # Parse the parameters for the prompt input/output workload + input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns") + input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens") + input_common_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "common_prefix_num_tokens", optional=True + ) + input_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "prefix_num_tokens" + ) + output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens") + + 
print_stats: bool = conf.get("print_stats", False) + assert isinstance(print_stats, bool), ( + "Field 'print_stats' should be either 'true' or 'false'" + ) + + args = GenConvArgs( + num_conversations=conf["num_conversations"], + text_files=text_files, + input_num_turns=input_num_turns, + input_common_prefix_num_tokens=input_common_prefix_num_tokens, + input_prefix_num_tokens=input_prefix_num_tokens, + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + print_stats=print_stats, + ) + return args + + +def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None: + # Collect statistics + conv_stats: list[dict[Any, Any]] = [] + req_stats: list[int] = [] + + print("\nCollecting statistics...") + for messages in conversations.values(): + # messages is a list of dicts + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + request_tokens: list[int] = [] + + req_tokens = 0 + for m in messages: + content = m["content"] + num_tokens = len(tokenizer(content).input_ids) + + if m["role"] == "user": + user_tokens.append(num_tokens) + # New user prompt including all chat history + req_tokens += num_tokens + request_tokens.append(req_tokens) + + elif m["role"] == "assistant": + assistant_tokens.append(num_tokens) + # Update assistant answer + # (will be part of chat history for the next user prompt) + req_tokens += num_tokens + + item_stats = { + "conversation_turns": len(messages), + "user_tokens": mean(user_tokens), + "assistant_tokens": mean(assistant_tokens), + } + + conv_stats.append(item_stats) + req_stats.extend(request_tokens) + + # Print statistics + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(conv_stats) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Request statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(req_stats, columns=["request_tokens"]) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + + +def generate_conversations( + args: GenConvArgs, tokenizer: AutoTokenizer +) -> ConversationsMap: + # Text for all user prompts + # (text from the input text files will be appended to this line) + base_prompt_text = "Please rewrite the following text and add more content: " + base_prompt_token_count = len( + tokenizer.encode(base_prompt_text, add_special_tokens=False) + ) + + logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}") + logger.info(args) + + list_of_tokens = [] + + for filename in args.text_files: + # Load text file that will be used to generate prompts + with open(filename) as file: + data = file.read() + tokens_in_file = tokenizer.encode(data, add_special_tokens=False) + list_of_tokens.extend(tokens_in_file) + + conversations: ConversationsMap = {} + conv_id = 0 + + # Generate number of turns for every conversation + turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations) + + # Turn count should be at least 2 (one user prompt and one assistant answer) + turn_count = np.maximum(turn_count, 2) + + # Round up to an even number (every user prompt should have an answer) + turn_count = turn_count + (turn_count % 2) + + # Generate number of prefix tokens for every conversation + conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample( + args.num_conversations + ) + + # Used to reduce shared text between conversations + # (jump/skip over text 
sections between conversations) + base_offset = 0 + + # Common prefix size for all conversations (only 1 sample required) + common_prefix_text = "" + common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0] + if common_prefix_tokens > 0: + # Using "." at the end to separate sentences + common_prefix_text = ( + tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "." + ) + base_offset += common_prefix_tokens + + for conv_id in range(args.num_conversations): + # Generate a single conversation + messages: MessagesList = [] + + nturns = turn_count[conv_id] + + # User prompt token count per turn (with lower limit) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count = np.maximum(input_token_count, base_prompt_token_count) + + # Assistant answer token count per turn (with lower limit) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count = np.maximum(output_token_count, 1) + + user_turn = True + for turn_id in range(nturns): + if user_turn: + role = "user" + num_tokens = input_token_count[turn_id] + + # Generate the user prompt, + # use a unique prefix (the conv_id) for each conversation + # (to avoid shared prefix between conversations) + content = f"{conv_id} is a nice number... " + + if len(common_prefix_text) > 0 and turn_id == 0: + content = common_prefix_text + content + + # Update the number of tokens left for the content + num_tokens -= len(tokenizer.encode(content, add_special_tokens=False)) + + if turn_id == 0: + prefix_num_tokens = conv_prefix_tokens[conv_id] + if prefix_num_tokens > 0: + # Add prefix text (context) to the first turn + start_offset = base_offset + end_offset = start_offset + prefix_num_tokens + assert len(list_of_tokens) > end_offset, ( + "Not enough input text to generate " + f"{prefix_num_tokens} tokens for the " + f"prefix text ({start_offset=}, {end_offset=})" + ) + + content += f"{conv_id}, " + tokenizer.decode( + list_of_tokens[start_offset:end_offset] + ) + base_offset += prefix_num_tokens + + # Add the actual user prompt/question after the prefix text + content += base_prompt_text + num_tokens -= base_prompt_token_count + + if num_tokens > 0: + # Add text from the input file (to reach the desired token count) + start_offset = base_offset + turn_id * input_token_count.max() + end_offset = start_offset + num_tokens + assert len(list_of_tokens) > end_offset, ( + f"Not enough input text to generate {num_tokens} tokens " + f"for the prompt ({start_offset=}, {end_offset=})" + ) + + # Convert tokens back to text + content += tokenizer.decode(list_of_tokens[start_offset:end_offset]) + else: + role = "assistant" + # This content will not be used as input to the LLM server + # (actual answers will be used instead). + # Content is only required to determine the min_tokens/max_tokens + # (inputs to the LLM server). 
+ num_tokens = output_token_count[turn_id] + assert len(list_of_tokens) > num_tokens, ( + f"Not enough input text to generate {num_tokens} " + "tokens for assistant content" + ) + content = tokenizer.decode(list_of_tokens[:num_tokens]) + + # Append the user/assistant message to the list of messages + messages.append({"role": role, "content": content}) + user_turn = not user_turn + + # Add the new conversation + conversations[f"CONV_ID_{conv_id}"] = messages + + # Increase base offset for the next conversation + base_offset += nturns + + if args.print_stats: + print_conv_stats(conversations, tokenizer) + + return conversations + + +def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap: + conversations: ConversationsMap = {} + + for item in input_list: + conv_id: str = item["id"] + assert isinstance(conv_id, str) + + assert conv_id not in conversations, ( + f"Conversation ID {conv_id} found more than once in the input" + ) + + messages: MessagesList = item["messages"] + assert isinstance(messages, list), ( + f"Conversation messages should be a list (ID: {conv_id})" + ) + assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})" + + conversations[conv_id] = messages + + logger.info(f"Using {len(conversations)} unique conversations (IDs)") + assert len(conversations) == len(input_list) + + # Print statistics about the selected conversations + stats: list[dict[str, Any]] = [] + for conv_data in conversations.values(): + stats.append({"num_turns": len(conv_data)}) + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles) + print(conv_stats.transpose()) + print(TEXT_SEPARATOR) + + return conversations + + +def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations: + output: ShareGptConversations = [] + for conv_id, conv_data in input_dict.items(): + new_item = {"id": conv_id, "messages": conv_data} + output.append(new_item) + + return output diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py new file mode 100644 index 0000000000..d4d3c1ca8c --- /dev/null +++ b/benchmarks/multi_turn/bench_utils.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from enum import Enum + + +class Color(str, Enum): + RED = "\033[91m" + GREEN = "\033[92m" + BLUE = "\033[94m" + PURPLE = "\033[95m" + CYAN = "\033[96m" + YELLOW = "\033[93m" + RESET = "\033[0m" + + +TEXT_SEPARATOR = "-" * 100 + +# Configure the logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] - %(message)s", + datefmt="%d-%m-%Y %H:%M:%S", +) +logger = logging.getLogger(__name__) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py new file mode 100644 index 0000000000..53c3207491 --- /dev/null +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -0,0 +1,1557 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import json +import logging +import multiprocessing as mp +import os +import random +import time +from collections import Counter, deque +from datetime import datetime +from enum import Enum +from http import HTTPStatus +from statistics import mean 
+from typing import NamedTuple, Optional, Union + +import aiohttp # type: ignore +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_dataset import ( + ConversationsMap, + ConvId, + GenConvArgs, + MessagesList, + ShareGptConversations, + conversations_dict_to_list, + conversations_list_to_dict, + generate_conversations, + parse_input_json_file, +) +from bench_utils import TEXT_SEPARATOR, Color, logger +from transformers import AutoTokenizer # type: ignore + +NUM_TOKENS_FROM_DATASET = 0 +TERM_SIGNAL = None + + +class ConversationSampling(str, Enum): + ROUND_ROBIN = "round_robin" + RANDOM = "random" + + def __str__(self): + return self.value + + +class ClientArgs(NamedTuple): + seed: int + max_num_requests: Optional[int] + skip_first_turn: bool + max_turns: Optional[int] + max_active_conversations: int + verbose: bool + print_content: bool + verify_output: bool + conversation_sampling: ConversationSampling + request_rate: float + + +class RequestArgs(NamedTuple): + chat_url: str + model: str + stream: bool + limit_min_tokens: int # Use negative value for no limit + limit_max_tokens: int # Use negative value for no limit + + +class BenchmarkArgs(NamedTuple): + url: str + num_clients: int + early_stop: bool + + +class ServerResponse(NamedTuple): + valid: bool + ttft_ms: float # time to first chunk + tpot_ms: float # time per output chunk (one or more tokens) + latency_ms: float + start_time_ms: float + first_chunk: str # first chunk of the content + content: str # includes the first_chunk + num_chunks: int + + def __str__(self) -> str: + return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501 + + +class RequestStats(NamedTuple): + ttft_ms: float + tpot_ms: float + latency_ms: float + start_time_ms: float + input_num_turns: int + input_num_tokens: int + output_num_tokens: int + output_num_chunks: int + output_num_first_chunk_tokens: int + approx_cached_percent: float + conversation_id: str + client_id: int + + def __str__(self) -> str: + return ( + f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501 + f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501 + f"approx_cached_percent {self.approx_cached_percent:.2f}%" + ) + + +class MetricStats: + def __init__(self) -> None: + self.min: Optional[float] = None + self.max: Optional[float] = None + self.avg: Optional[float] = None + self.sum = 0.0 + self.count = 0 + + def update(self, value: float) -> None: + if self.min is None: + self.min = value + else: + self.min = min(self.min, value) + + if self.max is None: + self.max = value + else: + self.max = max(self.max, value) + + self.sum += value + self.count += 1 + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}" + + +class MovingAverage: + def __init__(self, window_size: int) -> None: + self.window_size = window_size + self.window = np.zeros(window_size) + self.index = 0 + self.sum = 0.0 + self.count = 0 + self.avg: Optional[float] = None + + def update(self, new_value: float) -> None: + if self.count < self.window_size: + # Filling up the window + self.sum += new_value + self.window[self.count] = new_value + self.count += 1 + else: + # Window is full, start replacing old values + 
old_value = self.window[self.index] + self.sum = self.sum - old_value + new_value + self.window[self.index] = new_value + self.index = (self.index + 1) % self.window_size + + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f} ({self.count} samples)" + + +class DebugStats: + def __init__(self, logger: logging.Logger, window_size: int) -> None: + self.logger = logger + self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + "moving_avg_ttft_ms": MovingAverage(window_size), + "moving_avg_tpot_ms": MovingAverage(window_size), + "ttft_ms": MetricStats(), + "tpot_ms": MetricStats(), + "latency_ms": MetricStats(), + "input_num_turns": MetricStats(), + "input_num_tokens": MetricStats(), + "output_num_tokens": MetricStats(), + } + + def update(self, data: RequestStats) -> None: + self.metrics["ttft_ms"].update(data.ttft_ms) + self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms) + self.metrics["tpot_ms"].update(data.tpot_ms) + self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms) + self.metrics["latency_ms"].update(data.latency_ms) + self.metrics["input_num_turns"].update(data.input_num_turns) + self.metrics["input_num_tokens"].update(data.input_num_tokens) + self.metrics["output_num_tokens"].update(data.output_num_tokens) + + def print(self) -> None: + self.logger.info("-" * 50) + for k, v in self.metrics.items(): + kv_info = f"[{k:25}] {v}" + self.logger.info(kv_info) + self.logger.info("-" * 50) + + +# Must support Python 3.8, we can't use str.removeprefix(prefix) +# introduced in Python 3.9 +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + +def nanosec_to_millisec(value: float) -> float: + return value / 1000000.0 + + +def nanosec_to_sec(value: float) -> float: + return value / 1000000000.0 + + +async def send_request( + session: aiohttp.ClientSession, + messages: list[dict[str, str]], + chat_url: str, + model: str, + stream: bool = True, + min_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, +) -> ServerResponse: + payload = { + "model": model, + "messages": messages, + "seed": 0, + "temperature": 0.0, + } + + if stream: + payload["stream"] = True + payload["stream_options"] = {"include_usage": False} + + if min_tokens is not None: + payload["min_tokens"] = min_tokens + + if max_tokens is not None: + payload["max_tokens"] = max_tokens + + headers = {"Content-Type": "application/json"} + + # Calculate the timeout for the request + timeout_sec = 120 + if max_tokens is not None: + # Assume TPOT of 200ms and use max_tokens to determine timeout + timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + timeout = aiohttp.ClientTimeout(total=timeout_sec) + + valid_response = True + ttft: Optional[float] = None + chunk_delay: list[int] = [] + latency: Optional[float] = None + first_chunk = "" + generated_text = "" + + start_time: int = time.perf_counter_ns() + most_recent_timestamp: int = start_time + + async with session.post( + url=chat_url, json=payload, headers=headers, timeout=timeout + ) as response: + http_status = HTTPStatus(response.status) + if http_status == HTTPStatus.OK: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + if chunk == "[DONE]": + # End of stream + latency = time.perf_counter_ns() - start_time + elif stream is False: + data = json.loads(chunk) + message = 
data["choices"][0]["message"] + assert message["role"] == "assistant" + generated_text += message["content"] + else: + timestamp: int = time.perf_counter_ns() + data = json.loads(chunk) + + # Delta is the new content/text/data + delta = data["choices"][0]["delta"] + if delta.get("content", None): + if ttft is None: + # First token + first_token_time = time.perf_counter_ns() + ttft = first_token_time - start_time + first_chunk = delta["content"] + else: + # Decoding phase + chunk_delay.append(timestamp - most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + else: + valid_response = False + content = await response.text() + logger.warning( + f"{Color.YELLOW}Received HTTP status {http_status.value} " + f"({http_status.phrase}): {content}{Color.RESET}" + ) + + if latency is None: + latency = -1.0 + if valid_response: + # Streaming is disabled, latency was not set + latency = time.perf_counter_ns() - start_time + + if ttft is None: + # The response was a single chunk + ttft = latency + + # Each chunk may include more than one token + tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0 + num_chunks: int = len(chunk_delay) + + sr = ServerResponse( + valid=valid_response, + ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0, + tpot_ms=nanosec_to_millisec(tpot), + latency_ms=nanosec_to_millisec(latency), + start_time_ms=nanosec_to_millisec(start_time), + first_chunk=first_chunk, + content=generated_text, + num_chunks=num_chunks, + ) + return sr + + +def get_short_string(input: str) -> str: + n = 20 + if len(input) < 400: + return input + + return f"{input[:n]}...{input[-n:]}" + + +def get_token_count(tokenizer: AutoTokenizer, text: str) -> int: + return len(tokenizer(text, add_special_tokens=False).input_ids) + + +def get_messages_token_count( + tokenizer: AutoTokenizer, messages: list[dict[str, str]] +) -> int: + token_count = 0 + for m in messages: + token_count += get_token_count(tokenizer, m["content"]) + + return token_count + + +async def send_turn( + session: aiohttp.ClientSession, + client_id: int, + conv_id: str, + conversation_messages: MessagesList, + messages_to_use: int, + tokenizer: AutoTokenizer, + req_args: RequestArgs, + verbose: bool, + verify_output: bool, +) -> Optional[RequestStats]: + assert messages_to_use > 0 + assert messages_to_use <= len(conversation_messages) + + messages = conversation_messages[:messages_to_use] + + # Index of the next message (the role should be "user") + index = messages_to_use - 1 + + # Verify that the message has only two keys, "role" and "content" + assert len(messages[index].keys()) == 2 + assert "role" in messages[index] and "content" in messages[index] + assert messages[index]["role"] == "user", ( + f"Failed on conversation ID {conv_id}, message role should be user" + ) + + if verbose: + print( + f"{Color.CYAN}Messages (conversation ID {conv_id}," + f" {len(messages)} turns):{Color.RESET}", + messages, + ) + + # None means that there is no upper/lower limit for the output token count + min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens + max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens + + if len(conversation_messages) > messages_to_use: + # The conversation contains an assistant answer for the next user prompt + if ( + min_tokens == NUM_TOKENS_FROM_DATASET + or max_tokens == NUM_TOKENS_FROM_DATASET + ): + # Compute number of tokens in the answer (from the input conversation) + assistant_answer = 
conversation_messages[messages_to_use] + answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"]) + assert assistant_answer["role"] == "assistant" + + if min_tokens == NUM_TOKENS_FROM_DATASET: + min_tokens = max(1, answer_num_tokens) + + if max_tokens == NUM_TOKENS_FROM_DATASET: + max_tokens = max(1, answer_num_tokens) + + # Send the current conversation to LLM and get a response + response: ServerResponse = await send_request( + session, + messages, + req_args.chat_url, + req_args.model, + req_args.stream, + min_tokens, + max_tokens, + ) + + if response.valid is False: + # Request failed + return None + + # Compute number of tokens in input / output + input_num_tokens = get_messages_token_count(tokenizer, messages) + + # Num tokens in the user's last question + question_num_tokens = get_token_count(tokenizer, messages[index]["content"]) + + # Num tokens in the history/context of the question + assert input_num_tokens >= question_num_tokens + history_num_tokens = input_num_tokens - question_num_tokens + + # Num tokens in the LLM's answer (first chunk and full answer) + first_chunk_tokens = get_token_count(tokenizer, response.first_chunk) + + output_content = response.content + output_num_tokens = get_token_count(tokenizer, output_content) + + # Prefix caching approximated cached percent + approx_cached_percent = ( + 100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0 + ) + + # Compute the correct TTFT and TPOT (based on tokens and not chunks). + # Required because multiple output tokens may be bundled in a single chunk. + if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens: + # More than one token and more than one chunk in the output + decode_ms = response.latency_ms - response.ttft_ms + decode_num_tokens = output_num_tokens - first_chunk_tokens + tpot_ms = decode_ms / decode_num_tokens + else: + # In this case: output_num_tokens == first_chunk_tokens + # Output was a single chunk (output_num_tokens > 1) + # or even a single token (output_num_tokens == 1) + tpot_ms = 0.0 + + if first_chunk_tokens > 1: + # First chunk had multiple tokens, adjust TTFT for a single token + delta_ms = (first_chunk_tokens - 1) * tpot_ms + ttft_ms = max(0.1, response.ttft_ms - delta_ms) + else: + # First chunk had only one token + ttft_ms = response.ttft_ms + + rs = RequestStats( + ttft_ms=ttft_ms, + tpot_ms=tpot_ms, + latency_ms=response.latency_ms, + start_time_ms=response.start_time_ms, + input_num_turns=len(messages), + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + output_num_chunks=response.num_chunks, + output_num_first_chunk_tokens=first_chunk_tokens, + approx_cached_percent=approx_cached_percent, + conversation_id=conv_id, + client_id=client_id, + ) + + if verbose: + print( + f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}", + output_content, + ) + print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}") + print("-" * 70) + + # Save the LLM's answer (will be used as part of the context for the next user turn) + answer_index = messages_to_use + if len(conversation_messages) > answer_index: + assert conversation_messages[answer_index]["role"] == "assistant", ( + f"Failed on conversation ID {conv_id}, message role should be assistant" + ) + + orig_content = conversation_messages[answer_index]["content"] + if verify_output: + # Compare the new answer to the answer from the input file + debug_info = ( + f"LLM/dataset answers do not match ({conv_id}):" + 
f"\n'{get_short_string(output_content)}' (len: {len(output_content)})," + f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})" + ) + if orig_content != output_content: + raise ValueError(debug_info) + + # Update the answer + conversation_messages[answer_index]["content"] = output_content + else: + # A user prompt that has no answer, add the answer as a new message + new_answer = {"role": "assistant", "content": output_content} + conversation_messages.append(new_answer) + + return rs + + +async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: + # Generate a random time interval from the Poisson distribution + assert request_rate > 0 + + interval = np.random.exponential(1.0 / request_rate) + if verbose: + logger.info(f"Sleeping for {interval:.3f} seconds...") + await asyncio.sleep(interval) + + +async def client_main( + args: ClientArgs, + req_args: RequestArgs, + client_id: int, + tokenizer: AutoTokenizer, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + logger.info( + f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 + ) + + random.seed(args.seed) + np.random.seed(args.seed) + + # Active conversations + active_convs: ConversationsMap = {} + conv_id_queue: deque = deque(maxlen=args.max_active_conversations) + + # Keep track of how many messages have been used for each conversation + turns_count: Counter = Counter() + num_successes = 0 + num_failures = 0 + + # Track the timestamp (time.perf_counter()) + # of the last turn per conversation (only for debug) + time_of_last_turn: dict[ConvId, float] = {} + + # Flag that indicates that there are no new tasks (conversations) for the client + task_queue_empty = False + + async with aiohttp.ClientSession() as session: + # Print progress + + while task_queue_empty is False: + result = None + + if ( + args.max_num_requests + and num_successes + num_failures == args.max_num_requests + ): + logger.info( + f"{Color.YELLOW}Client {client_id} reached " + f"request limit{Color.RESET}" + ) + break + + if stop_event.is_set(): # type: ignore + logger.info( + f"{Color.YELLOW}Client {client_id} received " + f"a termination signal{Color.RESET}" + ) + break + + while ( + len(active_convs) < args.max_active_conversations + and task_queue_empty is False + ): + # Get a new conversation from the task queue + conv_id, messages = task_queue.get() + + if conv_id is TERM_SIGNAL: + task_queue_empty = True + break + + if args.skip_first_turn: + # Skip the first turn (both user and assistant), + # relevant if warmup was enabled. + # Default turns_count[conv_id] will be zero if conv_id + # was never inserted/updated in turns_count. 
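+                    # Illustrative example: for a 4-message conversation
+                    # [user, assistant, user, assistant] whose first turn was
+                    # already sent during warmup, the counter starts at 2, so
+                    # the first benchmark request below will send messages[:3],
+                    # ending with the second user prompt.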
+ turns_count[conv_id] += 2 + + if turns_count[conv_id] < len(messages): + # Add new conversation + active_convs[conv_id] = messages + conv_id_queue.append(conv_id) + + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501 + ) + + elif args.verbose: + # No more messages (conversation finished during the warmup) + logger.info( + f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501 + ) + + if len(active_convs) == 0 or task_queue_empty: + logger.info( + f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}" + ) + break + + # Pick an active conversation for the next request + if args.conversation_sampling == ConversationSampling.ROUND_ROBIN: + conv_id = conv_id_queue.pop() + else: + # ConversationSampling.RANDOM + active_ids = list(active_convs.keys()) + conv_id = random.choice(active_ids) + + messages = active_convs[conv_id] + assert isinstance(messages, list) and len(messages) > 0 + + # Update the amount of messages to use + turns_count[conv_id] += 1 + current_turn = turns_count[conv_id] + + assert current_turn < len(messages), ( + f"Turn number {current_turn} is invalid for conversation ID {conv_id}" + f" that has only {len(messages)} messages" + ) + + if args.verbose: + curr_time_sec: float = time.perf_counter() + time_since_last_turn: Union[str, float] = "N/A" + if conv_id in time_of_last_turn: + time_since_last_turn = round( + curr_time_sec - time_of_last_turn[conv_id], 3 + ) + logger.info( + f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501 + ) + time_of_last_turn[conv_id] = curr_time_sec + + success = True + try: + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + else: + # None means that the request failed, + # and should not be added to the statistics. 
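+                        # (send_turn returns None when the server response was
+                        # invalid, e.g. a non-200 HTTP status; the conversation
+                        # is removed below so it will not be retried.)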
+ success = False + num_failures += 1 + + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + + except asyncio.exceptions.TimeoutError: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + except Exception: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + if success: + num_successes += 1 + + # Update the turns counter to include the LLM response + # The LLM response will be used as context for the next user turn + turns_count[conv_id] += 1 + + max_turns = len(messages) + if args.max_turns is not None: + # Limit the number of turns in the conversation + max_turns = min(args.max_turns, max_turns) + + if turns_count[conv_id] >= max_turns: + # Conversation has no more turns (no longer active) + # save the updated conversation (with the LLM server's answer) + conv_queue.put((conv_id, active_convs.pop(conv_id))) + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} finished " + f"conversation ID {conv_id}{Color.RESET}" + ) + else: + # Conversation is not finished, insert it at the back of the queue + conv_id_queue.appendleft(conv_id) + + # Sleep between requests (if lambda is positive) + if args.request_rate > 0: + await poisson_sleep(args.request_rate, args.verbose) + + # Send indication that the client is done + conv_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + logger.info( + f"{Color.CYAN}Client {client_id} is done " + f"({num_successes=}, {num_failures=}){Color.RESET}" + ) + + +def worker_function( + client_id: int, + tokenizer: AutoTokenizer, + client_args: ClientArgs, + req_args: RequestArgs, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + asyncio.run( + client_main( + client_args, + req_args, + client_id, + tokenizer, + stop_event, + task_queue, + result_queue, + conv_queue, + ) + ) + + +def get_client_config( + args: argparse.Namespace, input_conv: ConversationsMap +) -> tuple[ClientArgs, RequestArgs]: + if args.num_clients < 1: + raise ValueError("Number of clients must be a positive number") + + if len(input_conv) < args.num_clients: + raise ValueError( + "Number of conversations must be equal or larger than the number of clients" + ) + + max_req_per_client: Optional[int] = None + if args.max_num_requests is not None: + # Max number of requests per client + req_per_client = args.max_num_requests // args.num_clients + if req_per_client < 1: + raise ValueError("Number of requests should be at least one per client") + max_req_per_client = req_per_client + + max_active_conversations = args.max_active_conversations + if max_active_conversations is None: + # Each client will have only one active conversation at a time + max_active_conversations = args.num_clients + + if max_active_conversations > len(input_conv): + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or less than the total number of conversations" + ) + + # Max number of active conversations per client + max_active_conv_per_client = 
max_active_conversations // args.num_clients + if max_active_conv_per_client < 1: + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or greater than the number of clients" + ) + + # Skip the first user turn (as part of the warmup) + skip_first_turn = args.warmup_step + + # Common arguments for all clients + client_args = ClientArgs( + seed=args.seed, + max_num_requests=max_req_per_client, + skip_first_turn=skip_first_turn, + max_turns=args.max_turns, + max_active_conversations=max_active_conv_per_client, + verbose=args.verbose, + print_content=args.print_content, + verify_output=args.verify_output, + conversation_sampling=args.conversation_sampling, + request_rate=args.request_rate, + ) + + if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: + if args.limit_min_tokens < 1 or args.limit_max_tokens < 1: + raise ValueError( + "Invalid min/max tokens limits (both limits should be provided)" + ) + if args.limit_min_tokens > args.limit_max_tokens: + raise ValueError( + "Invalid min/max tokens limits (min should not be larger than max)" + ) + + # Arguments for API requests + chat_url = f"{args.url}/v1/chat/completions" + req_args = RequestArgs( + chat_url=chat_url, + model=args.model, + stream=not args.no_stream, + limit_min_tokens=args.limit_min_tokens, + limit_max_tokens=args.limit_max_tokens, + ) + + return client_args, req_args + + +async def main_mp( + client_args: ClientArgs, + req_args: RequestArgs, + bench_args: BenchmarkArgs, + tokenizer: AutoTokenizer, + input_conv: ConversationsMap, +) -> tuple[ConversationsMap, list[RequestStats]]: + # An event that will trigger graceful termination of all the clients + stop_event = mp.Event() + + # Queue for input conversations (from the input file/dataset) + task_queue: mp.Queue = mp.Queue() + + # Queue for client measurements (TTFT, TPOT, etc. 
for each request) + result_queue: mp.Queue = mp.Queue() + + # Queue for output conversations (with the LLM answers, sent by the server) + conv_queue: mp.Queue = mp.Queue() + output_conv: ConversationsMap = {} + client_metrics: list[RequestStats] = [] + + # Start all clients + start_time = time.perf_counter_ns() + logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}") + + clients = [] + for client_id in range(bench_args.num_clients): + client = mp.Process( + name=f"client_{client_id}", + target=worker_function, + args=( + client_id, + tokenizer, + client_args, + req_args, + stop_event, + task_queue, + result_queue, + conv_queue, + ), + ) + clients.append(client) + client.start() + + # Submit all the input conversations as tasks for the clients + for conv_id, messages in input_conv.items(): + task_queue.put((conv_id, messages)) + + # Add termination signals for clients + for _ in range(bench_args.num_clients): + task_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + # Collect the updated conversations from all clients + num_clients_finished = 0 + total_convs = len(input_conv) + + debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500)) + + while num_clients_finished < bench_args.num_clients: + # Collect updated conversation + conv_id, messages = conv_queue.get() + + # Collect results (measurements) + while not result_queue.empty(): + new_data = result_queue.get() + client_metrics.append(new_data) + debug_stats.update(new_data) + + if conv_id is TERM_SIGNAL: + num_clients_finished += 1 + logger.info( + f"{Color.CYAN}{num_clients_finished} out of " + f"{bench_args.num_clients} clients finished{Color.RESET}" + ) + + if bench_args.early_stop and not stop_event.is_set(): + # Once one client finished, stop all other clients. + # there is no reason to continue the benchmark with fewer clients. + logger.info( + f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}" + ) + stop_event.set() + else: + output_conv[conv_id] = messages + + finished_convs = len(output_conv) + percent = finished_convs / total_convs + + # Tuned to control the print rate (can be changed if required) + print_cycle = max(3, int(bench_args.num_clients / 4)) + + if finished_convs % print_cycle == 0: + runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time) + logger.info( + f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501 + f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 + ) + + rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + if len(client_metrics) < (5 * bench_args.num_clients): + # Do not estimate the RPS if the number of samples is very low + # (threshold can be tuned if needed) + rps = "N/A" + + runtime_left_sec: Union[str, float] = round( + (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 + ) + if percent < 0.05: + # If less than 5% of the conversations were not finished, + # the estimation will probably be very inaccurate + # (threshold can be tuned if needed). + runtime_left_sec = "N/A" + + logger.info( + f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501 + ) + debug_stats.print() + + logger.info( + f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}" + ) + + # At this point all the clients finished, + # collect results (TTFT, TPOT, etc.) from all the clients. 
+ # This needs to happens before calling join on the clients + # (result_queue should be emptied). + while not result_queue.empty(): + client_metrics.append(result_queue.get()) + + logger.info(f"Collected {len(client_metrics)} samples from all the clients") + + # Wait for all clients to finish + for client in clients: + logger.info( + f"{Color.CYAN}Waiting for client {client.name} " + f"(is alive: {client.is_alive()}){Color.RESET}" + ) + + client.join(timeout=120) + + if client.is_alive(): + logger.warning( + f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}" + ) + client.terminate() + + exitcode = client.exitcode + if exitcode != 0: + logger.error( + f"{Color.RED}Client {client.name} exited " + f"with exit code {exitcode}{Color.RESET}" + ) + + logger.info( + f"All {bench_args.num_clients} clients exited (successfully " + f"finished {len(output_conv)} out of {total_convs} conversations)" + ) + + # Queues should be closed, required to avoid hang at interpreter shutdown + unfinished_tasks = 0 + while not task_queue.empty(): + task_queue.get() + unfinished_tasks += 1 + + if unfinished_tasks > 0: + # Can happen if not all tasks (conversations) have finished. + # May happen if --max-num-requests was used, + # or if an error occurred in one of the clients. + logger.debug(f"Discarding {unfinished_tasks} unfinished tasks") + + task_queue.close() + task_queue.join_thread() + + result_queue.close() + result_queue.join_thread() + + conv_queue.close() + conv_queue.join_thread() + + return output_conv, client_metrics + + +def get_filename_with_timestamp(label: str, extension: str) -> str: + time_now = datetime.now() + timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S") + filename = f"{label}__{timestamp}.{extension}" + return filename + + +def process_statistics( + client_metrics: list[RequestStats], + warmup_percentages: list[float], + test_params: dict, + verbose: bool, + gen_conv_args: Optional[GenConvArgs] = None, + excel_output: bool = False, +) -> None: + if len(client_metrics) == 0: + logger.info("No samples to process") + return + + logger.info(f"Processing {len(client_metrics)} samples...") + + raw_data = pd.DataFrame(client_metrics) + + if verbose: + # Calculate the time between user turns in each conversation (in a new column) + raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"]) + raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[ + "start_time_ms" + ].diff() + + # Convert milliseconds to seconds + raw_data["time_between_user_turns_sec"] = ( + raw_data["time_between_user_turns_sec"] / 1000.0 + ) + + # Final raw data should be sorted by time + raw_data = raw_data.sort_values(by=["start_time_ms"]) + raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"] + + percentiles = [0.25, 0.5, 0.75, 0.9] + + # Add more percentiles if there are enough samples + if len(raw_data) >= 100: + percentiles.append(0.99) + + if len(raw_data) >= 1000: + percentiles.append(0.999) + + if len(raw_data) >= 10000: + percentiles.append(0.9999) + + # Set precision for numbers in the output text (the dataframes) + pd.set_option("display.precision", 2) + + # Exclude parameters from RequestStats + exclude = [ + "start_time_ms", + "end_time_ms", + "output_num_first_chunk_tokens", + "approx_cached_percent", + "conversation_id", + "client_id", + ] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Parameters:{Color.RESET}") + for k, v in test_params.items(): + print(f"{k}={v}") + + # conversations generation parameters + if gen_conv_args is 
not None: + gen_params = { + "text_files": ", ".join(gen_conv_args.text_files), + "input_num_turns": str(gen_conv_args.input_num_turns), + "input_common_prefix_num_tokens": str( + gen_conv_args.input_common_prefix_num_tokens + ), + "input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens), + "input_num_tokens": str(gen_conv_args.input_num_tokens), + "output_num_tokens": str(gen_conv_args.output_num_tokens), + } + + print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}") + for k, v in gen_params.items(): + print(f"{k}={v}") + + print(TEXT_SEPARATOR) + + params_list = [] + df_list = [] + for percent in warmup_percentages: + # Select samples from the end (tail) of the dataframe + warmup_count = int(percent * len(raw_data)) + tail_count = len(raw_data) - warmup_count + if tail_count == 0: + # No reason to process if the count of samples is zero + break + + df = raw_data.tail(tail_count) + + # Runtime is the diff between the end of the last request + # and the start of the first request + runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0] + + # Convert milliseconds to seconds + runtime_sec = runtime_sec / 1000.0 + requests_per_sec = float(len(df)) / runtime_sec + + params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + + # Generate a summary of relevant metrics (and drop irrelevant data) + df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() + + # List for Excel file + params_list.append(params) + df_list.append(df) + + # Print the statistics summary + if percent > 0 or len(warmup_percentages) > 1: + print( + f"{Color.YELLOW}Statistics summary " + f"(assuming {percent:.0%} warmup samples):{Color.RESET}" + ) + else: + print(f"{Color.YELLOW}Statistics summary:{Color.RESET}") + + for k, v in params.items(): + if isinstance(v, float): + print(f"{k} = {v:.3f}") + else: + print(f"{k} = {v}") + print(TEXT_SEPARATOR) + print(df) + print(TEXT_SEPARATOR) + + if excel_output: + prefix = f"statistics_{test_params['num_clients']}_clients" + filename = get_filename_with_timestamp(prefix, "xlsx") + + with pd.ExcelWriter(filename, engine="xlsxwriter") as writer: + startrow = 0 + test_params_df = pd.DataFrame([test_params]) + test_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(test_params_df) + 3 + + if gen_conv_args is not None: + gen_params_df = pd.DataFrame([gen_params]) + gen_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=(startrow - 1) + ) + startrow += len(gen_params_df) + 3 + + for params, df_stats in zip(params_list, df_list): + df_params = pd.DataFrame([params]) + df_params.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(df_params) + 2 + df_stats.to_excel( + writer, sheet_name="Summary", index=True, startrow=startrow + ) + startrow += len(df_stats) + 3 + + raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0) + + logger.info( + f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}" + ) + + +async def get_server_info(url: str) -> None: + logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}") + async with aiohttp.ClientSession() as session: + # Get server version (not mandatory, "version" endpoint may not exist) + url_version = f"{url}/version" + async with session.get(url_version) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Server 
version: {text}{Color.RESET}") + + # Get available models + url_models = f"{url}/v1/models" + async with session.get(url_models) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Models:{Color.RESET}") + models_data = json.loads(text) + models_list = models_data["data"] + for model in models_list: + model_id = model["id"] + max_model_len = model.get("max_model_len", "N/A") + logger.info( + f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}" + ) + else: + logger.info(f"{Color.RED}Failed to get models{Color.RESET}") + + +async def main() -> None: + parser = argparse.ArgumentParser( + prog="Benchmark serving with multi-turn conversations", + description="Benchmark online inference using REST API", + ) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + + parser.add_argument( + "-i", + "--input-file", + type=str, + required=True, + help="Input JSON file with ShareGPT conversations or " + "configuration file for generation of synthetic conversations", + ) + parser.add_argument( + "-o", + "--output-file", + type=str, + default=None, + help="Output JSON file containing conversations with updated assistant answers", + ) + + parser.add_argument( + "--seed", + type=int, + default=0, + help="Seed for random number generators (default: 0)", + ) + parser.add_argument( + "-m", "--model", type=str, required=True, help="Path of the LLM model" + ) + parser.add_argument( + "-u", + "--url", + type=str, + default="http://localhost:8000", + help="Base URL for the LLM API server", + ) + + parser.add_argument( + "-p", + "--num-clients", + type=int, + default=1, + help="Number of clients that will send requests in parallel", + ) + parser.add_argument( + "-k", + "--max-active-conversations", + type=int, + default=None, + help="Max number of active conversations at a time (for all clients)", + ) + parser.add_argument( + "-n", + "--max-num-requests", + type=int, + default=None, + help="Max number of requests to send (total for all clients)", + ) + + parser.add_argument( + "--warmup-step", + default=False, + action="store_true", + help="Run a warmup step (using only the first turn of every conversation), " + "measurements will not be included in the final benchmark results", + ) + + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns/messages per conversation, " + "includes both user and assistant messages " + "(a positive number, e.g: 2, 4, 6, etc.), disabled by default", + ) + parser.add_argument( + "--no-early-stop", + default=False, + action="store_true", + help="By default, the benchmark will stop if at least one client exits." + " Use this flag to disable this behavior", + ) + + parser.add_argument( + "--limit-max-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set max_tokens for the output token count of each request " + "(must also set --limit-min-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + parser.add_argument( + "--limit-min-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set min_tokens for the output token count of each request " + "(must also set --limit-max-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + + parser.add_argument( + "--request-rate", + type=float, + default=0, + help="Expected request rate (Poisson process) per client in requests/sec." 
+ "Set to 0 for no delay between requests.", + ) + parser.add_argument( + "--conversation-sampling", + type=ConversationSampling, + choices=list(ConversationSampling), + default=ConversationSampling.ROUND_ROBIN, + help=( + "Strategy for selecting which conversation to use for the next request. " + "Options: 'round_robin' (cycle through conversations), " + "'random' (pick randomly)." + ), + ) + parser.add_argument( + "--verify-output", + default=False, + action="store_true", + help="Verify the LLM output (compare to the answers in the input JSON file)", + ) + + parser.add_argument( + "--no-stream", + default=False, + action="store_true", + help="Disable stream/streaming mode (set 'stream' to False in the API request)", + ) + + parser.add_argument( + "-e", + "--excel-output", + default=False, + action="store_true", + help="Export summary to Excel file (optional)", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--print-content", + default=False, + action="store_true", + help="Print the user prompts and the server's answers", + ) + + parser.add_argument( + "--warmup-percentages", + type=str, + default="0%", + help="Ignore the first X samples as warmup (X is a percentage)." + " A comma separated list of percentages can be used " + "(for example: --warmup-percentages=0%%,50%%)", + ) + + args = parser.parse_args() + + logger.info(args) + + logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}") + logger.info(f"url={args.url}") + logger.info(f"model={args.model}") + logger.info(f"num_clients={args.num_clients}") + + if args.verify_output: + logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}") + + # Calculate the amount of samples to filter (as warmup samples/measurements). 
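+    # Illustrative example: with --warmup-percentages=0%,50% and 1000 collected
+    # samples, the statistics summary is printed twice: once over all 1000
+    # samples and once over only the last 500 (the first 500 are treated as
+    # warmup and discarded).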
+ try: + warmup_percentages: list[float] = [0.0] + if not args.warmup_step: + # Warmup percentage can be used only if the warmup step was used + warmup_strings: list[str] = args.warmup_percentages.split(",") + warmup_strings = [x.replace("%", "") for x in warmup_strings] + warmup_percentages = [float(x) / 100 for x in warmup_strings] + + # Check for valid range (0 to 1) + for p in warmup_percentages: + assert p >= 0.0 and p < 1.0 + + # Sort from high to low warmup percentage + warmup_percentages.sort() + + logger.info( + f"Warmup percentages (percentage of samples): {warmup_percentages}" + ) + + except Exception: + raise ValueError( + f"Invalid --warmup-percentage={args.warmup_percentage}" + ) from None + + random.seed(args.seed) + np.random.seed(args.seed) + + if not os.path.exists(args.model): + raise OSError(f"Path does not exist: {args.model}") + logger.info("Loading tokenizer") + tokenizer = AutoTokenizer.from_pretrained(args.model) + + await get_server_info(args.url) + + # Load the input file (either conversations of configuration file) + logger.info(f"Reading input file: {args.input_file}") + with open(args.input_file) as f: + input_data = json.load(f) + + gen_conv_args = None + if isinstance(input_data, list): + # The conversations are stored as a list of dicts + logger.info(f"Found {len(input_data)} items in the input file") + + # Convert the list to a ConversationsMap + conversations = conversations_list_to_dict(input_data) + + elif isinstance(input_data, dict): + # The input file is a configuration file + # (type is determined by the field 'filetype') + if "filetype" not in input_data: + raise Exception( + f"Input file {args.input_file} is invalid (missing 'filetype')" + ) + + logger.info(f"Using input file with filetype: {input_data['filetype']}") + + gen_conv_args = parse_input_json_file(input_data) + + # Disable warning from "huggingface/tokenizers" + # (when using python multiprocessing and tokenizers) + os.environ["TOKENIZERS_PARALLELISM"] = "true" + + # Generate synthetic conversations + conversations = generate_conversations(gen_conv_args, tokenizer) + + else: + raise Exception(f"Input file {args.input_file} is invalid") + + if args.max_turns is not None: + if args.max_turns < 1: + raise ValueError("Max turns must be a positive number") + logger.info( + f"{Color.PURPLE}Max turns per conversation " + f"is limited to {args.max_turns}{Color.RESET}" + ) + + # Create benchmark configurations + client_args, req_args = get_client_config(args, conversations) + + bench_args = BenchmarkArgs( + url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop + ) + + # Warm-up step + if args.warmup_step: + # Only send a single user prompt from every conversation. + # max_active_conversations must be 1, + # otherwise the clients may exit after sending a single request + # (because the task queue is empty). 
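+        # (ClientArgs and BenchmarkArgs are NamedTuples, so _replace creates
+        # modified copies for the warmup run and leaves the main benchmark
+        # configuration untouched.)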
+ warmup_client_args = client_args._replace( + skip_first_turn=False, max_turns=1, max_active_conversations=1 + ) + + # Early stop should be disabled, + # all clients should finish their work before exiting + warmup_bench_args = bench_args._replace(early_stop=False) + + logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + conversations, _ = await main_mp( + warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations + ) + logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + + # Run the benchmark + start_time = time.perf_counter_ns() + client_convs, client_metrics = await main_mp( + client_args, req_args, bench_args, tokenizer, conversations + ) + total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + + # Calculate requests per second + total_runtime_sec = total_runtime_ms / 1000.0 + rps = len(client_metrics) / total_runtime_sec + logger.info( + f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" + f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + ) + + # Benchmark parameters + params = { + "model": args.model, + "num_clients": args.num_clients, + "num_conversations": len(conversations), + "active_conversations": args.max_active_conversations, + "seed": args.seed, + } + + if args.limit_min_tokens > 0: + params["min_tokens"] = args.limit_min_tokens + + if args.limit_max_tokens > 0: + params["max_tokens"] = args.limit_max_tokens + + # Process and print statistics (and save excel file with the statistics) + process_statistics( + client_metrics, + test_params=params, + warmup_percentages=warmup_percentages, + verbose=args.verbose, + gen_conv_args=gen_conv_args, + excel_output=args.excel_output, + ) + + if args.output_file is not None: + # Write a JSON file with the updated conversations + # The "assistant" content will contain the answers from the tested LLM + output_data: ShareGptConversations = conversations_dict_to_list(client_convs) + logger.info( + f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}" + ) + with open(args.output_file, "w") as f: + json.dump(output_data, f, indent=4) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py new file mode 100644 index 0000000000..c3622c99a2 --- /dev/null +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Download dataset from: +https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json + +Convert to OpenAI API: +export INPUT_FILE=sharegpt_20230401_clean_lang_split.json +python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128 +""" + +import argparse +import json +import random +from statistics import mean +from typing import Any, Optional + +import pandas as pd # type: ignore +import tqdm # type: ignore +from transformers import AutoTokenizer # type: ignore + + +def has_non_english_chars(text: str) -> bool: + return not text.isascii() + + +def content_is_valid( + content: str, min_content_len: Optional[int], max_content_len: Optional[int] +) -> bool: + if min_content_len and len(content) < min_content_len: + return False + + if max_content_len and len(content) > max_content_len: + return False + + return has_non_english_chars(content) + + +def print_stats( + conversations: "list[dict[Any, 
Any]]", tokenizer: Optional[AutoTokenizer] = None +) -> None: + # Collect statistics + stats = [] + + print("\nCollecting statistics...") + for item in tqdm.tqdm(conversations): + # item has "id" and "messages" + messages = item["messages"] + + user_turns = 0 + assistant_turns = 0 + user_words = 0 + assistant_words = 0 + conv_chars = 0 + + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + + for m in messages: + content = m["content"] + conv_chars += len(content) + content_num_words = content.count(" ") + 1 + + num_tokens = 0 + if tokenizer: + num_tokens = len(tokenizer(m["content"]).input_ids) + + if m["role"] == "user": + user_turns += 1 + user_words += content_num_words + if tokenizer: + user_tokens.append(num_tokens) + + elif m["role"] == "assistant": + assistant_turns += 1 + assistant_words += content_num_words + if tokenizer: + assistant_tokens.append(num_tokens) + + # assert user_turns == assistant_turns, \ + # f"Invalid conversation ID {item['id']}" + + conv_words = user_words + assistant_words + item_stats = { + "user_turns": user_turns, + "assistant_turns": assistant_turns, + "user_words": user_words, + "assistant_words": assistant_words, + "conv_turns": len(messages), + "conv_words": conv_words, + "conv_characters": conv_chars, + } + + if len(user_tokens) > 0: + item_stats["user_tokens"] = int(mean(user_tokens)) + + if len(assistant_tokens) > 0: + item_stats["assistant_tokens"] = int(mean(assistant_tokens)) + + stats.append(item_stats) + + print("\nStatistics:") + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + df = pd.DataFrame(stats) + print(df.describe(percentiles=percentiles).transpose()) + + +def convert_sharegpt_to_openai( + seed: int, + input_file: str, + output_file: str, + max_items: Optional[int], + min_content_len: Optional[int] = None, + max_content_len: Optional[int] = None, + min_turns: Optional[int] = None, + max_turns: Optional[int] = None, + model: Optional[str] = None, +) -> None: + if min_turns and max_turns: + assert min_turns <= max_turns + + if min_content_len and max_content_len: + # Verify that min is not larger than max if both were given + assert min_content_len <= max_content_len + + print( + f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=}," + f" {max_content_len=}, {min_turns=}, {max_turns=}\n" + ) + + random.seed(seed) + + tokenizer = None + if model is not None: + print(f"Loading tokenizer from: {model}") + tokenizer = AutoTokenizer.from_pretrained(model) + + # Read the ShareGPT JSON file + print(f"Reading file: {input_file}") + with open(input_file, encoding="utf-8") as f: + # Should be a list of dicts + # Each dict should have "id" (string) and "conversations" (list of dicts) + sharegpt_data = json.load(f) + + assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts" + + print(f"Total items in input file: {len(sharegpt_data):,}") + + print(f"Shuffling dataset with seed {seed}") + random.shuffle(sharegpt_data) + + # Map conversation ID to the all the messages + conversation_parts: dict[str, list[Any]] = {} + + for item in tqdm.tqdm(sharegpt_data): + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.) 
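+        # Illustrative example: items with ids "hiWPlMD_0" and "hiWPlMD_1" are
+        # two parts of the same conversation; both are collected under the key
+        # "hiWPlMD" and merged below (assumes exactly one "_" in every id).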
+ conv_id, _ = item["id"].split("_") + new_turns = item["conversations"] + + if conv_id not in conversation_parts: + # Start new conversation + conversation_parts[conv_id] = [] + elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0: + prev_turns = conversation_parts[conv_id][-1] + if prev_turns[-1]["from"] == new_turns[0]["from"]: + new_turns = new_turns[1:] + + if len(new_turns) > 0: + # We assume that parts are in order in the ShareGPT dataset + conversation_parts[conv_id].append(new_turns) + + dataset: list[dict[str, Any]] = [] + for conv_id, conv_parts in conversation_parts.items(): + new_item = {"id": conv_id} + + conversations: list[dict[str, str]] = [] + + # Merge all parts + for conv_part in conv_parts: + conversations.extend(conv_part) + + if len(conversations) > 0: + new_item["conversations"] = conversations + dataset.append(new_item) + + print(f"Total unique conversations (IDs) in input file: {len(dataset):,}") + + # Final output data + final_openai_dataset: list[dict] = [] + + # Filter conversations from the ShareGPT dataset and convert to OpenAI format + for item in tqdm.tqdm(dataset): + messages: list[dict] = [] + + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + conv_id = item["id"] + conversations = item["conversations"] + + if min_turns is not None and len(conversations) < min_turns: + # Skip short conversations + continue + + # Convert each message in the conversation, up to max_turns if specified + for i, turn in enumerate(conversations): + assert "from" in turn and "value" in turn, ( + f"Invalid conversation ID {conv_id} - missing 'from' or 'value'" + ) + + role = None + turn_from = turn["from"] + + if turn_from in {"human", "user"}: + role = "user" + elif turn_from in {"gpt", "bing", "chatgpt", "bard"}: + role = "assistant" + elif turn_from == "system": + role = "system" + + assert role is not None, ( + f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid" + ) + + if i == 0 and role != "user": + # If the first message is from assistant (gpt), skip it. + # this happens when the conversation is a follow-up + # to a previous conversation (from the same user). 
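+                # (Only this leading assistant message is dropped; the
+                # alternating user/assistant validation further below discards
+                # any conversation that is still mis-ordered.)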
+ continue + + if max_turns is not None and i >= max_turns: + break + + # Convert message to OpenAI format (with "role" and "content") + content = turn["value"] + messages.append({"role": role, "content": content}) + + # Add the converted conversation to the OpenAI format + if len(messages) > 0: + valid_messages = True + + # First turn should always be from the user + user_turn = True + + for m in messages: + # Make sure that turns alternate between user and assistant + if (user_turn and m["role"] != "user") or ( + not user_turn and m["role"] != "assistant" + ): + valid_messages = False + break + + user_turn = not user_turn + + content = m["content"] + valid_messages = content_is_valid( + content, min_content_len, max_content_len + ) + if not valid_messages: + break + + if valid_messages is True: + final_openai_dataset.append({"id": conv_id, "messages": messages}) + + assert len(final_openai_dataset) > 0, "Final number of conversations is zero" + + print_stats(final_openai_dataset) + + print_stats_again = False + if max_items is not None and len(final_openai_dataset) > max_items: + print(f"\n\nSampling {max_items} items from the dataset...") + print_stats_again = True + final_openai_dataset = random.sample(final_openai_dataset, max_items) + + if print_stats_again: + # Print stats after the dataset changed + print_stats(final_openai_dataset, tokenizer) + + # Write the converted data to a new JSON file + final_size = len(final_openai_dataset) + print(f"\nTotal conversations converted (after filtering): {final_size:,}") + print(f"\nWriting file: {output_file}") + with open(output_file, "w", encoding="utf-8") as f: + json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert ShareGPT dataset to OpenAI API format" + ) + parser.add_argument("input_file", help="Path to the input ShareGPT JSON file") + parser.add_argument( + "output_file", help="Path to the output OpenAI format JSON file" + ) + parser.add_argument( + "--seed", type=int, default=0, help="Seed for random number generators" + ) + parser.add_argument( + "--max-items", + type=int, + default=None, + help="Maximum number of items in the output file", + ) + parser.add_argument( + "--min-turns", + type=int, + default=None, + help="Minimum number of turns per conversation", + ) + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns per conversation", + ) + parser.add_argument( + "--min-content-len", + type=int, + default=None, + help="Min number of characters in the messages' content", + ) + parser.add_argument( + "--max-content-len", + type=int, + default=None, + help="Max number of characters in the messages' content", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="LLM model, only the tokenizer will be used", + ) + + args = parser.parse_args() + + convert_sharegpt_to_openai( + args.seed, + args.input_file, + args.output_file, + args.max_items, + args.min_content_len, + args.max_content_len, + args.min_turns, + args.max_turns, + args.model, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json new file mode 100644 index 0000000000..274d03c2bd --- /dev/null +++ b/benchmarks/multi_turn/generate_multi_turn.json @@ -0,0 +1,35 @@ +{ + "filetype": "generate_conversations", + "num_conversations": 24, + "text_files": ["pg1184.txt"], + "print_stats": false, + "prompt_input": { + 
"num_turns": { + "distribution": "uniform", + "min": 12, + "max": 18 + }, + "common_prefix_num_tokens": { + "distribution": "constant", + "value": 500 + }, + "prefix_num_tokens": { + "distribution": "lognormal", + "mean": 6, + "sigma": 4, + "max": 1500 + }, + "num_tokens": { + "distribution": "uniform", + "min": 120, + "max": 160 + } + }, + "prompt_output": { + "num_tokens": { + "distribution": "uniform", + "min": 80, + "max": 120 + } + } +} \ No newline at end of file diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt new file mode 100644 index 0000000000..f0e1935914 --- /dev/null +++ b/benchmarks/multi_turn/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +transformers>=4.46 +xlsxwriter>=3.2.1 \ No newline at end of file From f756a682d96ba1824b6a759017f9d27a7f5f0182 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Fri, 8 Aug 2025 11:18:33 -0700 Subject: [PATCH 094/932] [gpt-oss] guard import when triton kernel is not installed (#22529) Signed-off-by: Yongye Zhu Signed-off-by: Woosuk Kwon Co-authored-by: Woosuk Kwon --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 4482029c16..6d6a2e22bc 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import torch @@ -8,13 +8,16 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import extract_required_args +from vllm.utils import has_triton_kernels -if True: +if has_triton_kernels(): import triton_kernels.swiglu - from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, - PrecisionConfig, matmul_ogs) + from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs from triton_kernels.routing import routing +if TYPE_CHECKING: + from triton_kernels.matmul_ogs import PrecisionConfig + def triton_kernel_moe_forward( hidden_states: torch.Tensor, @@ -33,8 +36,8 @@ def triton_kernel_moe_forward( w2_scale: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None, - w1_precision=None, # PrecisionConfig or None - w2_precision=None, # PrecisionConfig or None + w1_precision: Optional["PrecisionConfig"] = None, + w2_precision: Optional["PrecisionConfig"] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, @@ -90,8 +93,8 @@ def triton_kernel_fused_experts( w2_scale: Optional[torch.Tensor] = None, w1_bias: Optional[torch.Tensor] = None, w2_bias: Optional[torch.Tensor] = None, - w1_precision=None, # PrecisionConfig or None - w2_precision=None, # PrecisionConfig or None + w1_precision: Optional["PrecisionConfig"] = None, + w2_precision: Optional["PrecisionConfig"] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, @@ -141,8 +144,14 @@ def triton_kernel_fused_experts( class 
BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): - def __init__(self, quant_config, max_num_tokens: int, num_dispatchers: int, - w1_precision: PrecisionConfig, w2_precision: PrecisionConfig): + def __init__( + self, + quant_config, + max_num_tokens: int, + num_dispatchers: int, + w1_precision: "PrecisionConfig", + w2_precision: "PrecisionConfig", + ): super().__init__(quant_config) self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers From e29059407251c071a75b1b1d89471326add28b90 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Fri, 8 Aug 2025 12:26:21 -0700 Subject: [PATCH 095/932] =?UTF-8?q?[Docs]=20Rename=20=E2=80=9CDistributed?= =?UTF-8?q?=20inference=20and=20serving=E2=80=9D=20to=20=E2=80=9CParalleli?= =?UTF-8?q?sm=20&=20Scaling=E2=80=9D=20(#22466)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ricardo Decal --- docs/models/supported_models.md | 20 +++++++++---------- ...uted_serving.md => parallelism_scaling.md} | 2 +- docs/usage/troubleshooting.md | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) rename docs/serving/{distributed_serving.md => parallelism_scaling.md} (99%) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 265643a441..b79650444a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -320,7 +320,7 @@ th { } -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -426,7 +426,7 @@ See [this page](./pooling_models.md) for more information on how to use pooling These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | | `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | @@ -466,7 +466,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. 
| ✅︎ | ✅︎ | | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | @@ -483,7 +483,7 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | @@ -521,7 +521,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -594,7 +594,7 @@ See [this page](generative_models.md) for more information on how to use generat These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| | `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | | `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | @@ -647,7 +647,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| | `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ | @@ -726,7 +726,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th Speech2Text models trained specifically for Automatic Speech Recognition. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ | @@ -744,7 +744,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A The following table lists those that are tested in vLLM. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| | `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | | | `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | | @@ -760,7 +760,7 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][parallelism-scaling] | [V1](gh-issue:8779) | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. 
| | | ✅︎ | diff --git a/docs/serving/distributed_serving.md b/docs/serving/parallelism_scaling.md similarity index 99% rename from docs/serving/distributed_serving.md rename to docs/serving/parallelism_scaling.md index fc9d9f8a34..fa7fc1b290 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/parallelism_scaling.md @@ -1,4 +1,4 @@ -# Distributed inference and serving +# Parallelism and Scaling ## Distributed inference strategies for a single-model replica diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index f9ba32c58c..9715ad66d9 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -289,7 +289,7 @@ Traceback (most recent call last): ... ``` -This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Distributed Inference and Serving](../serving/distributed_serving.md#running-vllm-on-multiple-nodes) for guidance on properly configuring the environment for distributed serving. +This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA. ## Known Issues From fe6d8257a1859cdd938cb2ec2a63a45c666dcca3 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 8 Aug 2025 15:06:37 -0700 Subject: [PATCH 096/932] [gpt-oss] Support tool call and implement MCP tool server (#22427) Signed-off-by: Chen Zhang --- vllm/entrypoints/harmony_utils.py | 5 +- vllm/entrypoints/openai/api_server.py | 6 +- vllm/entrypoints/openai/serving_responses.py | 185 +++++++++++-------- vllm/entrypoints/tool_server.py | 119 +++++++++++- 4 files changed, 233 insertions(+), 82 deletions(-) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 87e76e08a0..efca1472e4 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -237,7 +237,10 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: id=f"rs_{random_uuid()}", summary=[], type="reasoning", - text=content.text, + content=[ + ResponseReasoningTextContent(text=content.text, + type="reasoning_text") + ], status=None, ) output_items.append(reasoning_item) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c695ea8b5a..00eaba8c87 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -94,7 +94,8 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.tool_server import DemoToolServer, ToolServer +from vllm.entrypoints.tool_server import (DemoToolServer, MCPToolServer, + ToolServer) from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) from vllm.logger import init_logger @@ -1635,6 +1636,9 @@ async def init_app_state( if args.tool_server == "demo": tool_server: Optional[ToolServer] = DemoToolServer() + elif args.tool_server: + tool_server = MCPToolServer() + await tool_server.add_tool_server(args.tool_server) else: tool_server = None diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py 
index a7554e0d68..1e3746e956 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,6 +4,7 @@ import asyncio import time from collections.abc import AsyncGenerator, AsyncIterator +from contextlib import AsyncExitStack from copy import copy from http import HTTPStatus from typing import Any, Callable, Final, Optional, Union @@ -226,65 +227,114 @@ class OpenAIServingResponses(OpenAIServing): # Schedule the request and get the result generator. generators: list[AsyncGenerator[ConversationContext, None]] = [] - try: - tool_sessions: dict[str, Any] = {} - for i, engine_prompt in enumerate(engine_prompts): - default_max_tokens = self.max_model_len - len( - engine_prompt["prompt_token_ids"]) - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params) - trace_headers = (None if raw_request is None else await - self._get_trace_headers(raw_request.headers)) - - context: ConversationContext - if self.use_harmony: - if request.stream: - context = StreamingHarmonyContext( - messages, tool_sessions) - else: - context = HarmonyContext(messages, tool_sessions) + builtin_tool_list: list[str] = [] + if self.use_harmony and self.tool_server is not None: + if self.tool_server.has_tool("browser"): + builtin_tool_list.append("browser") + if self.tool_server.has_tool("python"): + builtin_tool_list.append("python") + async with AsyncExitStack() as exit_stack: + try: + if self.tool_server is not None: + # TODO: initialize tool sessions lazily when the session + # is actually used. + tool_session_ctxs: dict[str, Any] = { + tool_name: + exit_stack.enter_async_context( + self.tool_server.new_session(tool_name)) + for tool_name in builtin_tool_list + } + tool_sessions = {} + for tool_name in builtin_tool_list: + tool_sessions[tool_name] = ( + await tool_session_ctxs[tool_name]) else: - context = SimpleContext() - generator = self._generate_with_builtin_tools( - request_id=request.request_id, - request_prompt=request_prompts[i], - engine_prompt=engine_prompt, - sampling_params=sampling_params, - context=context, - lora_request=lora_request, - priority=request.priority, - trace_headers=trace_headers, + assert len(builtin_tool_list) == 0 + tool_sessions = {} + for i, engine_prompt in enumerate(engine_prompts): + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers( + raw_request.headers)) + + context: ConversationContext + if self.use_harmony: + if request.stream: + context = StreamingHarmonyContext( + messages, tool_sessions) + else: + context = HarmonyContext(messages, tool_sessions) + else: + context = SimpleContext() + generator = self._generate_with_builtin_tools( + request_id=request.request_id, + request_prompt=request_prompts[i], + engine_prompt=engine_prompt, + sampling_params=sampling_params, + context=context, + lora_request=lora_request, + priority=request.priority, + trace_headers=trace_headers, + ) + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + assert len(generators) == 1 + result_generator, = generators + + # Store the input messages. 
+ if request.store: + self.msg_store[request.request_id] = messages + + if request.background: + created_time = int(time.time()) + response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="queued", + usage=None, ) - generators.append(generator) - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + async with self.response_store_lock: + self.response_store[response.id] = response - assert len(generators) == 1 - result_generator, = generators + # Run the request in the background. + task = asyncio.create_task( + self._run_background_request( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + ), + name=f"create_{response.id}", + ) - # Store the input messages. - if request.store: - self.msg_store[request.request_id] = messages + # For cleanup. + response_id = response.id + self.background_tasks[response_id] = task + task.add_done_callback( + lambda _: self.background_tasks.pop(response_id, None)) + return response - if request.background: - created_time = int(time.time()) - response = ResponsesResponse.from_request( - request, - sampling_params, - model_name=model_name, - created_time=created_time, - output=[], - status="queued", - usage=None, - ) - async with self.response_store_lock: - self.response_store[response.id] = response + if request.stream: + raise NotImplementedError( + "Streaming responses are not supported") - # Run the request in the background. - task = asyncio.create_task( - self._run_background_request( + try: + return await self.responses_full_generator( request, sampling_params, result_generator, @@ -292,33 +342,10 @@ class OpenAIServingResponses(OpenAIServing): model_name, tokenizer, request_metadata, - created_time, - ), - name=f"create_{response.id}", - ) - - # For cleanup. 
- response_id = response.id - self.background_tasks[response_id] = task - task.add_done_callback( - lambda _: self.background_tasks.pop(response_id, None)) - return response - - if request.stream: - raise NotImplementedError("Streaming responses are not supported") - - try: - return await self.responses_full_generator( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - ) - except Exception as e: - return self.create_error_response(str(e)) + ) + except Exception as e: + return self.create_error_response(str(e)) + return self.create_error_response("Should not reach here") async def _make_request( self, diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 769c40e8cc..352704b2b3 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from contextlib import AbstractAsyncContextManager, asynccontextmanager -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional from openai_harmony import ToolNamespaceConfig @@ -11,6 +11,61 @@ from vllm.logger import init_logger logger = init_logger(__name__) +if TYPE_CHECKING: + from mcp.types import ListToolsResult + + +async def list_server_and_tools(server_url: str): + from mcp import ClientSession + from mcp.client.sse import sse_client + + async with sse_client(url=server_url) as streams, ClientSession( + *streams) as session: + initialize_response = await session.initialize() + list_tools_response = await session.list_tools() + return initialize_response, list_tools_response + + +def trim_schema(schema: dict) -> dict: + # Turn the JSON Schema generated by MCP into Harmony's variant. + if "title" in schema: + del schema["title"] + if "default" in schema and schema["default"] is None: + del schema["default"] + if "anyOf" in schema: + # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}] + # into "type": ["type-1", "type-2"] + # If there's more than one type, also remove the "null" type as Harmony will + # just ignore it + types = [ + type_dict["type"] for type_dict in schema["anyOf"] + if type_dict["type"] != 'null' + ] + schema["type"] = types + del schema["anyOf"] + if "properties" in schema: + schema["properties"] = { + k: trim_schema(v) + for k, v in schema["properties"].items() + } + return schema + + +def post_process_tools_description( + list_tools_result: "ListToolsResult") -> "ListToolsResult": + # Adapt the MCP tool result for Harmony + for tool in list_tools_result.tools: + tool.inputSchema = trim_schema(tool.inputSchema) + + # Some tool schemas don't need to be part of the prompt (e.g. simple text + # in text out for Python) + list_tools_result.tools = [ + tool for tool in list_tools_result.tools + if getattr(tool.annotations, "include_in_prompt", True) + ] + + return list_tools_result + class ToolServer(ABC): @@ -38,6 +93,66 @@ class ToolServer(ABC): ... + +class MCPToolServer(ToolServer): + + def __init__(self): + try: + import mcp # noqa: F401 + except ImportError: + raise ImportError( + "mcp is not installed. 
Please run `pip install mcp` to use " + "MCPToolServer.") from None + self.harmony_tool_descriptions = {} + + async def add_tool_server(self, server_url: str): + from mcp.types import ToolDescription + tool_urls = server_url.split(",") + self.harmony_tool_descriptions = {} + self.urls: dict[str, str] = {} + for url in tool_urls: + url = f"http://{url}/sse" + initialize_response, list_tools_response = ( + await list_server_and_tools(url)) + + list_tools_response = post_process_tools_description( + list_tools_response) + + tool_from_mcp = ToolNamespaceConfig( + name=initialize_response.serverInfo.name, + description=initialize_response.instructions, + tools=[ + ToolDescription.new(name=tool.name, + description=tool.description, + parameters=tool.inputSchema) + for tool in list_tools_response.tools + ]) + self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp + if tool_from_mcp.name not in self.urls: + self.urls[tool_from_mcp.name] = url + else: + logger.warning( + "Tool %s already exists. Ignoring duplicate tool server %s", + tool_from_mcp.name, url) + + def has_tool(self, tool_name: str): + return tool_name in self.harmony_tool_descriptions + + def get_tool_description(self, tool_name: str): + return self.harmony_tool_descriptions.get(tool_name) + + @asynccontextmanager + async def new_session(self, tool_name: str): + from mcp import ClientSession + from mcp.client.sse import sse_client + url = self.urls.get(tool_name) + if not url: + raise KeyError(f"Tool '{tool_name}' is not supported") + async with sse_client(url=url) as streams, ClientSession( + *streams) as session: + await session.initialize() + yield session + + class DemoToolServer(ToolServer): def __init__(self): @@ -67,4 +182,6 @@ class DemoToolServer(ToolServer): @asynccontextmanager async def new_session(self, tool_name: str): + if tool_name not in self.tools: + raise KeyError(f"Tool '{tool_name}' is not supported") yield self.tools[tool_name] From cd9b9de1fb009cf607403ba08961f2a3f869931d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 8 Aug 2025 19:09:42 -0400 Subject: [PATCH 097/932] [BugFix] Fix IMA FlashMLA full cuda-graph and DP + Update FlashMLA (#21691) Signed-off-by: Lucas Wilkinson Co-authored-by: yewentao256 Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- cmake/external_projects/flashmla.cmake | 8 ++-- vllm/attention/ops/flashmla.py | 1 - vllm/v1/attention/backends/mla/flashmla.py | 56 ++++++++++++++-------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index 6291475164..ee6768bce2 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -19,7 +19,7 @@ else() FetchContent_Declare( flashmla GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git - GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845 + GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1 GIT_PROGRESS TRUE CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -37,9 +37,9 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) set(FlashMLA_SOURCES ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu) + ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu + ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu + 
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu) set(FlashMLA_INCLUDES ${flashmla_SOURCE_DIR}/csrc/cutlass/include diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index b85f27ac41..1af26dfc3d 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -91,7 +91,6 @@ def flash_mla_with_kvcache( out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla( q, k_cache, - None, head_dim_v, cache_seqlens, block_table, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index b5aecff993..2b0f52cf80 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -70,6 +70,22 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): self.cg_buf_tile_scheduler_metadata = None self.cg_buf_num_splits = None + device_properties = torch.cuda.get_device_properties(self.device) + num_sms = device_properties.multi_processor_count + + if self.compilation_config.full_cuda_graph: + self.cg_buf_tile_scheduler_metadata = torch.zeros( + # Upper bound on size (<= #SMs, TileSchedulerMetaDataSize) + # TileSchedulerMetaDataSize = 8 + (num_sms, 8), + device=self.device, + dtype=torch.int32, + ) + self.cg_buf_num_splits = torch.empty( + (vllm_config.scheduler_config.max_num_seqs + 1), + device=self.device, + dtype=torch.int32) + def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens: torch.Tensor) -> FlashMLADecodeMetadata: tile_scheduler_metadata, num_splits = \ @@ -80,28 +96,28 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): ) if self.compilation_config.full_cuda_graph: - # First time around (CUDAGraph capture), allocate the static buffer - if self.cg_buf_tile_scheduler_metadata is None: - self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata - self.cg_buf_num_splits = num_splits - else: - assert self.cg_buf_num_splits is not None + assert self.cg_buf_tile_scheduler_metadata is not None + assert self.cg_buf_num_splits is not None - # Metadata per-SM, fixed size (#SMs, TileMetadataSize) - assert (self.cg_buf_tile_scheduler_metadata.size() == - tile_scheduler_metadata.size()) - self.cg_buf_tile_scheduler_metadata.\ - copy_(tile_scheduler_metadata) - tile_scheduler_metadata = self.cg_buf_tile_scheduler_metadata + sm_parts = tile_scheduler_metadata.size(0) + # Metadata per-SM, upper bound on size (<= #SMs, TileMetadataSize) + assert sm_parts <= self.cg_buf_tile_scheduler_metadata.size(0) + tile_scheduler_metadata_view = \ + self.cg_buf_tile_scheduler_metadata[:sm_parts] + tile_scheduler_metadata_view.copy_(tile_scheduler_metadata) + tile_scheduler_metadata = tile_scheduler_metadata_view - # Num splits is per-batch, varying size (batch_size,) - n = num_splits.size(0) - # make sure static buffer is large enough - assert n <= self.cg_buf_num_splits.size(0) - num_splits_view = self.cg_buf_num_splits[:n] - num_splits_view.copy_(num_splits) - self.cg_buf_num_splits[n:].fill_(0) # fill the rest with 0s - num_splits = num_splits_view + # Num splits is per-batch, varying size (batch_size,) + n = num_splits.size(0) + # make sure static buffer is large enough + assert n <= self.cg_buf_num_splits.size(0) + num_splits_view = self.cg_buf_num_splits[:n] + num_splits_view.copy_(num_splits) + # Num splits needs to be monotonically increasing + # (with: https://github.com/vllm-project/FlashMLA/pull/3, otherwise + # it needs to increase monotonically by 1) + self.cg_buf_num_splits[n:].fill_(num_splits[-1]) + num_splits = 
num_splits_view return FlashMLADecodeMetadata( block_table=block_table_tensor, From f703b923f3885157cf02b951c42f967c25329b01 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 8 Aug 2025 19:09:59 -0400 Subject: [PATCH 098/932] [Misc] DeepGEMM : Avoid JIT generation in the hot-path (#22215) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../layers/fused_moe/deep_gemm_moe.py | 12 - .../layers/fused_moe/fused_moe.py | 55 +++-- .../model_executor/warmup/deep_gemm_warmup.py | 219 ++++++++++++++++++ vllm/model_executor/warmup/kernel_warmup.py | 20 ++ vllm/v1/worker/gpu_worker.py | 5 + 5 files changed, 274 insertions(+), 37 deletions(-) create mode 100644 vllm/model_executor/warmup/deep_gemm_warmup.py create mode 100644 vllm/model_executor/warmup/kernel_warmup.py diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index ba7105c83a..9b8175f42a 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -237,18 +237,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): assert w1_scale is not None assert w2_scale is not None - if not env.VLLM_SKIP_DEEP_GEMM_WARMUP: - # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing - # to happen during actual model-inference. The - # `warmup_deepgemm_kernels` function is a `run_once` decorated - # function that executes during the model profile run. This warmup - # should create all the required JITs for the current model. - warmup_deepgemm_gg_contiguous_kernels(w1, - w2, - w1_scale, - w2_scale, - num_topk=topk_ids.size(1)) - a1q = hidden_states _, N, K = w1.size() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 597af08c3c..f4f5457ebc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -4,6 +4,9 @@ import functools import json import os +# torch.compile needs typing.List. 
It will fail torch.library.infer_schema +# otherwise +from typing import List # noqa: UP035 from typing import Any, Callable, Optional import torch @@ -998,29 +1001,30 @@ def get_config_dtype_str( return None -def inplace_fused_experts(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: +def inplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None) -> None: #noqa: UP006 fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, @@ -1082,7 +1086,7 @@ def flashinfer_fused_moe_blockscale_fp8( intermediate_size: int, expert_offset: int, local_num_experts: int, - block_shape: list[int], + block_shape: List[int], #noqa: UP006 routed_scaling: float = 1.0) -> torch.Tensor: from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe assert top_k <= global_num_experts @@ -1264,7 +1268,8 @@ def outplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> torch.Tensor: + block_shape: Optional[List[int]] = None, #noqa: UP006 +) -> torch.Tensor: return fused_experts_impl( hidden_states, w1, w2, topk_weights, topk_ids, False, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py new file mode 100644 index 0000000000..74599fa44c --- /dev/null +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Warmup deep_gemm kernels. +DeepGEMM JIT's the kernels. The warmup aims to JIT all the kernels that would +be used during model execution beforehand. 
+""" + +import torch +from tqdm import tqdm + +import vllm.envs as envs +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( + compute_aligned_M, deep_gemm_block_shape) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts) +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +from vllm.utils.deep_gemm import fp8_gemm_nt, m_grouped_fp8_gemm_nt_contiguous + + +def _extract_data_from_linear_base_module( + m: torch.nn.Module) -> tuple[torch.Tensor, torch.Tensor, list[int]]: + """ + Extract weights, weight scales and quantization block sizes from the given + LinearBase module. + """ + assert isinstance(m, LinearBase) + assert isinstance(m.quant_method, Fp8LinearMethod) + assert m.quant_method.block_quant + assert m.quant_method.quant_config is not None + + w = m.weight + ws = m.weight_scale_inv + quant_block_size = m.quant_method.quant_config.weight_block_size + + assert isinstance(w, torch.Tensor) + assert isinstance(ws, torch.Tensor) + assert quant_block_size is not None + return (w, ws, quant_block_size) + + +def _extract_data_from_fused_moe_module( + m: torch.nn.Module +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]: + """ + Extract weights, weight scales and num_topk from FusedMoE module. + """ + assert isinstance(m, FusedMoE) + w13 = m.w13_weight + w13_s = m.w13_weight_scale_inv + w2 = m.w2_weight + w2_s = m.w2_weight_scale_inv + num_topk = m.top_k + + assert isinstance(w13, torch.Tensor) + assert isinstance(w13_s, torch.Tensor) + assert isinstance(w2, torch.Tensor) + assert isinstance(w2_s, torch.Tensor) + return w13, w13_s, w2, w2_s, num_topk + + +def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool: + """ + Return True if the input module/layer could be processed with DeepGEMM. 
+ """ + block_size = deep_gemm_block_shape()[0] + if not (isinstance(module, LinearBase) + and isinstance(module.quant_method, Fp8LinearMethod) + and module.quant_method.block_quant): + return False + + w, _, block_sizes = _extract_data_from_linear_base_module(module) + return (block_sizes == deep_gemm_block_shape() and w.ndim == 2 + and w.shape[0] % block_size == 0 and w.shape[1] % block_size == 0) + + +def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: + if not (isinstance(module, FusedMoE) + and module.moe_config.quant_dtype == torch.float8_e4m3fn + and module.moe_config.block_shape == deep_gemm_block_shape()): + return False + + if not isinstance(module.quant_method.fused_experts, + FusedMoEModularKernel): + # fused_experts could invoke deep_gemm_moe_fp8 + return True + + mk: FusedMoEModularKernel = module.quant_method.fused_experts + # Further check if the ModularKernel implementation uses the DeepGemmExperts + return isinstance(mk.fused_experts, + (DeepGemmExperts, TritonOrDeepGemmExperts)) + + +FP8_GEMM_NT_WARMUP_CACHE: set[torch.Size] = set() + + +def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor, + max_tokens: int): + if w.size() in FP8_GEMM_NT_WARMUP_CACHE: + return + + n, k = w.size() + block_m = deep_gemm_block_shape()[0] + + device = w.device + a1q = torch.empty((max_tokens, k), + device=device, + dtype=torch.float8_e4m3fn) + a1q_scales = torch.empty((max_tokens, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((max_tokens, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm(total=max_tokens, + desc=f"DeepGemm(fp8_gemm_nt) warmup (W={w.size()})") + num_tokens = max_tokens + while num_tokens > 0: + fp8_gemm_nt((a1q[:num_tokens], a1q_scales[:num_tokens]), (w, ws), + out[:num_tokens]) + pbar.update(1) + num_tokens -= 1 + + FP8_GEMM_NT_WARMUP_CACHE.add(w.size()) + + +GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set() + + +def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int): + if (w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE + and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE): + return + + assert w1.size(0) == w2.size(0), ( + "w1 and w2 must have the same number of experts") + + block_m = deep_gemm_block_shape()[0] + num_experts = w1.size(0) + device = w1.device + + # This is the maximum GroupedGemm M size that we expect to run + # the grouped_gemm with. + MAX_M = compute_aligned_M(envs.VLLM_FUSED_MOE_CHUNK_SIZE, + num_topk, + num_experts, + block_m, + expert_tokens_meta=None) + # Distribute expert-ids evenly. 
+ MAX_BLOCKS = MAX_M // block_m + expert_ids_block = torch.randint(low=0, + high=num_experts, + size=(MAX_BLOCKS, ), + device=device, + dtype=torch.int32) + expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): + + _, n, k = w.size() + a1q = torch.empty((MAX_M, k), device=device, dtype=torch.float8_e4m3fn) + a1q_scales = torch.empty((MAX_M, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm( + total=MAX_BLOCKS, + desc= + f"DeepGemm(m_grouped_fp8_gemm_nt_contiguous) warmup (W={w.size()})" + ) + num_tokens = MAX_M + while num_tokens > 0: + m_grouped_fp8_gemm_nt_contiguous( + (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale), + out[:num_tokens], expert_ids[:num_tokens]) + pbar.update(1) + num_tokens = num_tokens - block_m + + for w, ws in [(w1, w1_scale), (w2, w2_scale)]: + if w.size() not in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: + _warmup(w, ws) + GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE.add(w.size()) + + +def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int): + dg_modules = [ + m for m in model.modules() if _fp8_linear_may_use_deep_gemm(m) + ] + + for dgm in dg_modules: + w, ws, _ = _extract_data_from_linear_base_module(dgm) + _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens) + + +def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module): + dg_modules = [ + m for m in model.modules() + if _fused_moe_grouped_gemm_may_use_deep_gemm(m) + ] + + for dgm in dg_modules: + w13, w13_scale, w2, w2_scale, num_topk = ( + _extract_data_from_fused_moe_module(dgm)) + _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup( + w13, w2, w13_scale, w2_scale, num_topk) + + +def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int): + deepgemm_fp8_gemm_nt_warmup(model, max_tokens) + deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py new file mode 100644 index 0000000000..10f2dc0252 --- /dev/null +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Warmup kernels used during model execution. +This is useful specifically for JIT'ed kernels as we don't want JIT'ing to +happen during model execution. 
+""" +import torch + +import vllm.envs as envs +from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup +from vllm.utils.deep_gemm import is_deep_gemm_supported + + +def kernel_warmup(model: torch.nn.Module, max_tokens: int): + do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM + and is_deep_gemm_supported() + and not envs.VLLM_SKIP_DEEP_GEMM_WARMUP) + if do_deep_gemm_warmup: + deep_gemm_warmup(model, max_tokens) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7fca245c1b..0ea23921a0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -21,6 +21,7 @@ from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask @@ -338,6 +339,10 @@ class Worker(WorkerBase): self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) + # Warmup kernels used during model execution + kernel_warmup(self.get_model(), + max_tokens=self.scheduler_config.max_num_batched_tokens) + # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) From bd875d2eb71b130cbc2b68bf0e2dd285f5c7348d Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 01:10:25 +0200 Subject: [PATCH 099/932] [Bugfix] Update FA commit hash (#22546) Signed-off-by: Thomas Parnell --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 59b99e9e20..d24d8e8e5e 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba + GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 41b965575136a72c21927b87a16bd7460b3a3cf8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 00:20:58 +0100 Subject: [PATCH 100/932] Skip Qwen 1 in CI because remote code is no longer compatible with Transformers (#22536) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index 2c2d094e04..b1952ce9c2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,6 +278,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers", # noqa: E501 trust_remote_code=True), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501 From 2fcf6b27b6902a18aaf4a6fb8cf5c7efc8afc731 
Mon Sep 17 00:00:00 2001 From: Guy Stone Date: Fri, 8 Aug 2025 19:22:35 -0400 Subject: [PATCH 101/932] [Docs] fix broken links in metrics.md (#22315) Signed-off-by: Guy Stone Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/metrics.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 1f65331d3c..b01838883f 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -57,11 +57,11 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:spec_decode_num_draft_tokens_total` (Counter) - `vllm:spec_decode_num_emitted_tokens_total` (Counter) -These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md). +These are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md). ### Grafana Dashboard -vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -455,7 +455,7 @@ In general: [an escape hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. -See the [deprecation policy](../../contributing/deprecation_policy.md) for +See the [deprecation policy](../contributing/deprecation_policy.md) for the project-wide deprecation policy. ### Unimplemented - `vllm:tokens_total` @@ -655,7 +655,7 @@ v0 has support for OpenTelemetry tracing: - Added by - Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces` - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) -- [User-facing docs](../../examples/online_serving/opentelemetry.md) +- [User-facing docs](../examples/online_serving/opentelemetry.md) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product docs](https://www.ibm.com/docs/en/instana-observability/current?topic=mgaa-monitoring-large-language-models-llms-vllm-public-preview) From baece8c3d26484b918fa123c884e6ea81237b661 Mon Sep 17 00:00:00 2001 From: yyweiss <70619747+yyweiss@users.noreply.github.com> Date: Sat, 9 Aug 2025 02:23:44 +0300 Subject: [PATCH 102/932] [Frontend] Add unix domain socket support (#18097) Signed-off-by: Signed-off-by: yyw --- docs/cli/README.md | 3 ++ tests/entrypoints/openai/test_uds.py | 43 +++++++++++++++++++++++++++ tests/utils.py | 27 ++++++++++++----- vllm/entrypoints/openai/api_server.py | 27 ++++++++++++----- vllm/entrypoints/openai/cli_args.py | 2 ++ 5 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 tests/entrypoints/openai/test_uds.py diff --git a/docs/cli/README.md b/docs/cli/README.md index b1371c82a4..a7de6d7192 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -29,6 +29,9 @@ Start the vLLM OpenAI Compatible API server. 
# Specify the port vllm serve meta-llama/Llama-2-7b-hf --port 8100 + # Serve over a Unix domain socket + vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock + # Check with --help for more options # To list all groups vllm serve --help=listgroup diff --git a/tests/entrypoints/openai/test_uds.py b/tests/entrypoints/openai/test_uds.py new file mode 100644 index 0000000000..5c39869a79 --- /dev/null +++ b/tests/entrypoints/openai/test_uds.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from tempfile import TemporaryDirectory + +import httpx +import pytest + +from vllm.version import __version__ as VLLM_VERSION + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def server(): + with TemporaryDirectory() as tmpdir: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + "--uds", + f"{tmpdir}/vllm.sock", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_show_version(server: RemoteOpenAIServer): + transport = httpx.HTTPTransport(uds=server.uds) + client = httpx.Client(transport=transport) + response = client.get(server.url_for("version")) + response.raise_for_status() + + assert response.json() == {"version": VLLM_VERSION} diff --git a/tests/utils.py b/tests/utils.py index 741b4401cc..18fcde9491 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Any, Callable, Literal, Optional, Union import cloudpickle +import httpx import openai import pytest import requests @@ -88,10 +89,12 @@ class RemoteOpenAIServer: raise ValueError("You have manually specified the port " "when `auto_port=True`.") - # Don't mutate the input args - vllm_serve_args = vllm_serve_args + [ - "--port", str(get_open_port()) - ] + # No need for a port if using unix sockets + if "--uds" not in vllm_serve_args: + # Don't mutate the input args + vllm_serve_args = vllm_serve_args + [ + "--port", str(get_open_port()) + ] if seed is not None: if "--seed" in vllm_serve_args: raise ValueError("You have manually specified the seed " @@ -104,8 +107,13 @@ class RemoteOpenAIServer: subparsers = parser.add_subparsers(required=False, dest="subparser") parser = ServeSubcommand().subparser_init(subparsers) args = parser.parse_args(["--model", model, *vllm_serve_args]) - self.host = str(args.host or 'localhost') - self.port = int(args.port) + self.uds = args.uds + if args.uds: + self.host = None + self.port = None + else: + self.host = str(args.host or 'localhost') + self.port = int(args.port) self.show_hidden_metrics = \ args.show_hidden_metrics_for_version is not None @@ -150,9 +158,11 @@ class RemoteOpenAIServer: def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() + client = (httpx.Client(transport=httpx.HTTPTransport( + uds=self.uds)) if self.uds else requests) while True: try: - if requests.get(url).status_code == 200: + if client.get(url).status_code == 200: break except Exception: # this exception can only be raised by requests.get, @@ -170,7 +180,8 @@ class RemoteOpenAIServer: @property def url_root(self) -> str: - return f"http://{self.host}:{self.port}" + return (f"http://{self.uds.split('/')[-1]}" + if self.uds else f"http://{self.host}:{self.port}") def 
url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 00eaba8c87..e5d31c1fd0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1777,6 +1777,12 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket: return sock +def create_server_unix_socket(path: str) -> socket.socket: + sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM) + sock.bind(path) + return sock + + def validate_api_server_args(args): valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ @@ -1807,8 +1813,11 @@ def setup_server(args): # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 - sock_addr = (args.host or "", args.port) - sock = create_server_socket(sock_addr) + if args.uds: + sock = create_server_unix_socket(args.uds) + else: + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) # workaround to avoid footguns where uvicorn drops requests with too # many concurrent requests active @@ -1820,12 +1829,14 @@ def setup_server(args): signal.signal(signal.SIGTERM, signal_handler) - addr, port = sock_addr - is_ssl = args.ssl_keyfile and args.ssl_certfile - host_part = f"[{addr}]" if is_valid_ipv6_address( - addr) else addr or "0.0.0.0" - listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" - + if args.uds: + listen_address = f"unix:{args.uds}" + else: + addr, port = sock_addr + is_ssl = args.ssl_keyfile and args.ssl_certfile + host_part = f"[{addr}]" if is_valid_ipv6_address( + addr) else addr or "0.0.0.0" + listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" return listen_address, sock diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e89463a03c..e15f65b430 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -72,6 +72,8 @@ class FrontendArgs: """Host name.""" port: int = 8000 """Port number.""" + uds: Optional[str] = None + """Unix domain socket path. 
If set, host and port arguments are ignored.""" uvicorn_log_level: Literal["debug", "info", "warning", "error", "critical", "trace"] = "info" """Log level for uvicorn.""" From e3edc0a7a8f015b938d5cd77a44638dde28ab3a9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 00:34:25 +0100 Subject: [PATCH 103/932] Extract `CompilationConfig` from `config.py` (#22524) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 33 -- vllm/{config.py => config/__init__.py} | 449 +------------------------ vllm/config/compilation.py | 428 +++++++++++++++++++++++ vllm/config/utils.py | 29 ++ vllm/engine/arg_utils.py | 8 +- 5 files changed, 467 insertions(+), 480 deletions(-) rename vllm/{config.py => config/__init__.py} (91%) create mode 100644 vllm/config/compilation.py create mode 100644 vllm/config/utils.py diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c282bf0023..93ac18dfcc 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -93,32 +93,6 @@ class NestedConfig: """field""" -@config -@dataclass -class FromCliConfig1: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 1 - return inst - - -@config -@dataclass -class FromCliConfig2: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 2 - return inst - - @config @dataclass class DummyConfig: @@ -144,10 +118,6 @@ class DummyConfig: """Dict which will be JSON in CLI""" nested_config: NestedConfig = field(default_factory=NestedConfig) """Nested config""" - from_cli_config1: FromCliConfig1 = field(default_factory=FromCliConfig1) - """Config with from_cli method""" - from_cli_config2: FromCliConfig2 = field(default_factory=FromCliConfig2) - """Different config with from_cli method""" @pytest.mark.parametrize(("type_hint", "expected"), [ @@ -199,9 +169,6 @@ def test_get_kwargs(): assert json_tip in kwargs["json_tip"]["help"] # nested config should should construct the nested config assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2) - # from_cli configs should be constructed with the correct method - assert kwargs["from_cli_config1"]["type"]('{"field": 2}').field == 3 - assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4 @pytest.mark.parametrize( diff --git a/vllm/config.py b/vllm/config/__init__.py similarity index 91% rename from vllm/config.py rename to vllm/config/__init__.py index 7147702edd..eaed6017cc 100644 --- a/vllm/config.py +++ b/vllm/config/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: F401 import ast import copy import enum @@ -10,11 +11,9 @@ import json import textwrap import uuid import warnings -from collections import Counter from collections.abc import Mapping from contextlib import contextmanager -from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, - replace) +from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from functools import cached_property, lru_cache from importlib.util import find_spec from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, @@ -22,7 +21,7 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, import regex as re import torch -from 
pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, +from pydantic import (ConfigDict, SkipValidation, field_validator, model_validator) from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE @@ -31,7 +30,9 @@ from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.config.compilation import (CompilationConfig, CompilationLevel, + PassConfig) +from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.platforms import current_platform @@ -50,8 +51,7 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, LayerBlockType, LazyLoader, common_broadcastable_dtype, cuda_device_count_stateless, get_cpu_memory, - get_open_port, is_torch_equal_or_newer, random_uuid, - resolve_obj_by_qualname) + get_open_port, random_uuid) # yapf: enable @@ -70,7 +70,6 @@ if TYPE_CHECKING: from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig - ConfigType = type[DataclassInstance] HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any @@ -83,7 +82,6 @@ else: BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any - ConfigType = type HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -93,7 +91,6 @@ else: logger = init_logger(__name__) DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance) -ConfigT = TypeVar("ConfigT", bound=ConfigType) TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription", "draft"] @@ -234,23 +231,6 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]: return out -def config(cls: ConfigT) -> ConfigT: - """ - A decorator that ensures all fields in a dataclass have default values - and that each field has a docstring. - - If a `ConfigT` is used as a CLI argument itself, the default value provided - by `get_kwargs` will be the result parsing a JSON string as the kwargs - (i.e. `ConfigT(**json.loads(cli_arg))`). However, if a particular `ConfigT` - requires custom construction from CLI (i.e. `CompilationConfig`), it can - have a `from_cli` method, which will be called instead. - - Config validation is performed by the tools/validate_config.py - script, which is invoked during the pre-commit checks. - """ - return cls - - def get_field(cls: ConfigType, name: str) -> Field: """Get the default factory field of a dataclass by name. Used for getting default factory fields in `EngineArgs`.""" @@ -4154,421 +4134,6 @@ class KVEventsConfig: """ -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 - - -@config -@dataclass -class PassConfig: - """Configuration for custom Inductor passes. 
- - This is separate from general `CompilationConfig` so that inductor passes - don't all have access to full configuration - that would create a cycle as - the `PassManager` is set as a property of config.""" - - enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) - """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" - enable_attn_fusion: bool = False - """Whether to enable the custom attention+quant fusion pass.""" - enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) - """Whether to enable the custom no-op elimination pass.""" - enable_sequence_parallelism: bool = False - """Whether to enable sequence parallelism.""" - enable_async_tp: bool = False - """Whether to enable async TP.""" - enable_fi_allreduce_fusion: bool = False - """Whether to enable flashinfer allreduce fusion.""" - fi_allreduce_fusion_max_token_num: int = 16384 - """Max number of tokens to used in flashinfer allreduce fusion.""" - - # TODO(luka) better pass enabling system. - - def uuid(self): - """ - Produces a hash unique to the pass configuration. - Any new fields that affect compilation should be added to the hash. - Any future fields that don't affect compilation should be excluded. - """ - return InductorPass.hash_dict(asdict(self)) - - def __post_init__(self) -> None: - if not self.enable_noop: - if self.enable_fusion: - logger.warning_once( - "Fusion enabled but reshape elimination disabled. " - "RMSNorm/SiluMul + quant (fp8) fusion might not work") - if self.enable_attn_fusion: - logger.warning_once( - "Fusion enabled but reshape elimination disabled. " - "Attention + quant (fp8) fusion might not work") - - -@config -@dataclass -class CompilationConfig: - """Configuration for compilation. It has three parts: - - - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] - - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - - [`backend`][vllm.config.CompilationConfig.backend] - - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] - - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] - - CudaGraph capture: - - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] - - [`cudagraph_capture_sizes`] - [vllm.config.CompilationConfig.cudagraph_capture_sizes] - - [`cudagraph_num_of_warmups`] - [vllm.config.CompilationConfig.cudagraph_num_of_warmups] - - [`cudagraph_copy_inputs`] - [vllm.config.CompilationConfig.cudagraph_copy_inputs] - - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] - - Inductor compilation: - - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] - - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - - [`inductor_compile_config`] - [vllm.config.CompilationConfig.inductor_compile_config] - - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] - - custom inductor passes - - Why we have different sizes for cudagraph and inductor: - - cudagraph: a cudagraph captured for a specific size can only be used - for the same size. We need to capture all the sizes we want to use. - - inductor: a graph compiled by inductor for a general shape can be used - for different sizes. Inductor can also compile for specific sizes, - where it can have more information to optimize the graph with fully - static shapes. However, we find the general shape compilation is - sufficient for most cases. 
It might be beneficial to compile for - certain small batchsizes, where inductor is good at optimizing. - """ - # Top-level Compilation control - level: Optional[int] = None - """The level of compilation: - - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" - debug_dump_path: str = "" - """The path to dump the debug information.""" - cache_dir: str = "" - """The directory to store the compiled graph, to accelerate Inductor - compilation. By default, it will use model-related information to generate - a cache directory.""" - backend: str = "" - """The backend for compilation. It needs to be a string: - - - "" (empty string): use the default backend. - - "eager"/"openxla"/...: use the specified backend registered in PyTorch. - - "full.module.name": a qualified name which can be used to import the - - backend function. - We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is - used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation - (it sees a part of the graph).""" - custom_ops: list[str] = field(default_factory=list) - """Fine-grained control over which custom ops to enable/disable. Use 'all' - to enable all, 'none' to disable all. Also specify a list of custom op - names to enable (prefixed with a '+'), or disable (prefixed with a '-'). - Examples: - - - 'all,-op1' to enable all except op1 - - 'none,+op1,+op2' to enable only op1 and op2 - - By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. - Inductor generates (fused) Triton kernels for disabled custom ops.""" - splitting_ops: list[str] = field(default_factory=list) - """A list of ops to split the full graph into subgraphs, used in piecewise - compilation.""" - - # Inductor capture - use_inductor: bool = True - """Whether to use inductor compilation: - - - False: inductor compilation is not used. graph runs in eager - (custom_ops enabled by default). - - True: inductor compilation is used (custom_ops disabled by default). - One graph for symbolic shape and one graph per size in compile_sizes - are compiled using configurations in inductor_compile_config. - - This setting is ignored if level1.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. 
- """ - factors: list[Any] = [] - factors.append(self.level) - factors.append(self.backend) - factors.append(self.custom_ops) - factors.append(self.splitting_ops) - factors.append(self.use_inductor) - factors.append(self.inductor_compile_config) - factors.append(self.inductor_passes) - factors.append(self.pass_config.uuid()) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __repr__(self) -> str: - exclude = { - "static_forward_context": True, - "enabled_custom_ops": True, - "disabled_custom_ops": True, - "compilation_time": True, - "bs_to_padded_graph_size": True, - "traced_files": True, - "inductor_compile_config": { - "post_grad_custom_post_pass": True, - }, - } - - # exclude default attr in pass_config - pass_config_exclude = {} - for attr, default_val in vars(PassConfig()).items(): - if getattr(self.pass_config, attr) == default_val: - pass_config_exclude[attr] = True - if pass_config_exclude: - exclude["pass_config"] = pass_config_exclude - - # The cast to string is necessary because Pydantic is mocked in docs - # builds and sphinx-argparse doesn't know the return type of decode() - return str( - TypeAdapter(CompilationConfig).dump_json( - self, - exclude=exclude, # type: ignore[arg-type] - exclude_unset=True).decode()) - - __str__ = __repr__ - - @classmethod - def from_cli(cls, cli_value: str) -> "CompilationConfig": - """Parse the CLI value for the compilation config. - -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. - """ - return TypeAdapter(CompilationConfig).validate_json(cli_value) - - def __post_init__(self) -> None: - count_none = self.custom_ops.count("none") - count_all = self.custom_ops.count("all") - assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" - - # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2: - # 1. A bug in PyTorch, fixed in 2.7: - # https://github.com/pytorch/pytorch/issues/147924 - # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't - # work with V2. Addressing this will take extra engineering effort - # and it is not yet a priority. 
RFC here: - # https://github.com/vllm-project/vllm/issues/14703 - - if is_torch_equal_or_newer("2.6"): - KEY = 'enable_auto_functionalized_v2' - if KEY not in self.inductor_compile_config: - self.inductor_compile_config[KEY] = False - - for k, v in self.inductor_passes.items(): - if not isinstance(v, str): - assert callable(v), ( - f"pass {k} should be callable or a qualified name") - self.inductor_compile_config[k] = v if isinstance( - v, InductorPass) else CallableInductorPass(v) - continue - - # resolve function from qualified name - names = v.split(".") - module = ".".join(names[:-1]) - func_name = names[-1] - func = __import__(module).__dict__[func_name] - self.inductor_compile_config[k] = func if isinstance( - func, InductorPass) else CallableInductorPass(func) - - if isinstance(self.pass_config, dict): - self.pass_config = PassConfig(**self.pass_config) - - def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: - if self.level == CompilationLevel.NO_COMPILATION: - raise ValueError("No compilation level is set.") - - from torch._dynamo.backends.registry import list_backends - torch_backends = list_backends(exclude_tags=tuple()) - if self.level in [ - CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE - ]: - if self.backend == "": - return "eager" - if self.backend in torch_backends: - return self.backend - return resolve_obj_by_qualname(self.backend) - - # TODO: pass user-specified backend to piecewise compilation - # merge with the config use_inductor - assert self.level == CompilationLevel.PIECEWISE - - from vllm.compilation.backends import VllmBackend - return VllmBackend(vllm_config) - - def init_with_cudagraph_sizes(self, - cudagraph_capture_sizes: list[int]) -> None: - """To complete the initialization of config, - we need to know the cudagraph sizes.""" - - if self.cudagraph_capture_sizes is None: - self.cudagraph_capture_sizes = cudagraph_capture_sizes - else: - # de-duplicate the sizes provided by the config - dedup_sizes = list(set(self.cudagraph_capture_sizes)) - if len(dedup_sizes) < len(self.cudagraph_capture_sizes): - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - cudagraph_capture_sizes, dedup_sizes) - self.cudagraph_capture_sizes = dedup_sizes - - computed_compile_sizes = [] - if self.compile_sizes is not None: - # de-duplicate the sizes provided by the config - self.compile_sizes = list(set(self.compile_sizes)) - for x in self.compile_sizes: - if isinstance(x, str): - assert x == "cudagraph_capture_sizes", \ - "Unrecognized size type in compile_sizes, " \ - f"expect 'cudagraph_capture_sizes', got {x}" - computed_compile_sizes.extend(self.cudagraph_capture_sizes) - else: - assert isinstance(x, int) - computed_compile_sizes.append(x) - self.compile_sizes = computed_compile_sizes # type: ignore - - # sort to make sure cudagraph capture sizes are in descending order - self.cudagraph_capture_sizes.sort(reverse=True) - self.max_capture_size = self.cudagraph_capture_sizes[ - 0] if self.cudagraph_capture_sizes else 0 - - # pre-compute the mapping from batch size to padded graph size - self.bs_to_padded_graph_size = [ - 0 for i in range(self.max_capture_size + 1) - ] - for end, start in zip(self.cudagraph_capture_sizes, - self.cudagraph_capture_sizes[1:] + [0]): - for bs in range(start, end): - if bs == start: - self.bs_to_padded_graph_size[bs] = start - else: - self.bs_to_padded_graph_size[bs] = end - self.bs_to_padded_graph_size[ - self.max_capture_size] = self.max_capture_size - - def 
set_splitting_ops_for_v1(self): - # NOTE: this function needs to be called - if self.splitting_ops and self.full_cuda_graph: - raise ValueError("full_cuda_graph cannot be used together with " - "splitting_ops, as Full CUDA graph will override " - f"the splitting_ops: {self.splitting_ops}") - - if not self.splitting_ops: - self.splitting_ops = [] if self.full_cuda_graph else [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - "vllm.mamba_mixer2", - ] - - @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py new file mode 100644 index 0000000000..c1b3a61217 --- /dev/null +++ b/vllm/config/compilation.py @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections import Counter +from dataclasses import asdict, field +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +from pydantic import TypeAdapter +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname + +if TYPE_CHECKING: + from vllm.config.config import VllmConfig +else: + VllmConfig = object + +logger = init_logger(__name__) + + +class CompilationLevel: + # constants for the levels of the compilation process + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + PIECEWISE = 3 + + +@config +@dataclass +class PassConfig: + """Configuration for custom Inductor passes. + + This is separate from general `CompilationConfig` so that inductor passes + don't all have access to full configuration - that would create a cycle as + the `PassManager` is set as a property of config.""" + + enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) + """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" + enable_attn_fusion: bool = False + """Whether to enable the custom attention+quant fusion pass.""" + enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1) + """Whether to enable the custom no-op elimination pass.""" + enable_sequence_parallelism: bool = False + """Whether to enable sequence parallelism.""" + enable_async_tp: bool = False + """Whether to enable async TP.""" + enable_fi_allreduce_fusion: bool = False + """Whether to enable flashinfer allreduce fusion.""" + fi_allreduce_fusion_max_token_num: int = 16384 + """Max number of tokens to used in flashinfer allreduce fusion.""" + + # TODO(luka) better pass enabling system. + + def uuid(self): + """ + Produces a hash unique to the pass configuration. + Any new fields that affect compilation should be added to the hash. + Any future fields that don't affect compilation should be excluded. + """ + return InductorPass.hash_dict(asdict(self)) + + def __post_init__(self) -> None: + if not self.enable_noop: + if self.enable_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "RMSNorm/SiluMul + quant (fp8) fusion might not work") + if self.enable_attn_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "Attention + quant (fp8) fusion might not work") + + +@config +@dataclass +class CompilationConfig: + """Configuration for compilation. 
It has three parts: + + - Top-level Compilation control: + - [`level`][vllm.config.CompilationConfig.level] + - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] + - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] + - [`backend`][vllm.config.CompilationConfig.backend] + - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] + - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] + - CudaGraph capture: + - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] + - [`cudagraph_capture_sizes`] + [vllm.config.CompilationConfig.cudagraph_capture_sizes] + - [`cudagraph_num_of_warmups`] + [vllm.config.CompilationConfig.cudagraph_num_of_warmups] + - [`cudagraph_copy_inputs`] + [vllm.config.CompilationConfig.cudagraph_copy_inputs] + - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] + - Inductor compilation: + - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] + - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`inductor_compile_config`] + [vllm.config.CompilationConfig.inductor_compile_config] + - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] + - custom inductor passes + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ + # Top-level Compilation control + level: Optional[int] = None + """The level of compilation: + + - None: If None, we will select the default compilation level. + For V1 engine this is 3, for V0 engine this is 0. + - 0: no compilation. + - 1: dynamo as is. + - 2: dynamo once. + - 3: piecewise compilation.""" + debug_dump_path: str = "" + """The path to dump the debug information.""" + cache_dir: str = "" + """The directory to store the compiled graph, to accelerate Inductor + compilation. By default, it will use model-related information to generate + a cache directory.""" + backend: str = "" + """The backend for compilation. It needs to be a string: + + - "" (empty string): use the default backend. + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. + - "full.module.name": a qualified name which can be used to import the + + backend function. + We use string to avoid serialization issues when using compilation in a + distributed setting. When the compilation level is 1 or 2, the backend is + used for the compilation directly (it sees the whole graph). When the + compilation level is 3, the backend is used for the piecewise compilation + (it sees a part of the graph).""" + custom_ops: list[str] = field(default_factory=list) + """Fine-grained control over which custom ops to enable/disable. Use 'all' + to enable all, 'none' to disable all. Also specify a list of custom op + names to enable (prefixed with a '+'), or disable (prefixed with a '-'). 
+ Examples: + + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + + By default, all custom ops are enabled when running without Inductor and + disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + Inductor generates (fused) Triton kernels for disabled custom ops.""" + splitting_ops: list[str] = field(default_factory=list) + """A list of ops to split the full graph into subgraphs, used in piecewise + compilation.""" + + # Inductor capture + use_inductor: bool = True + """Whether to use inductor compilation: + + - False: inductor compilation is not used. graph runs in eager + (custom_ops enabled by default). + - True: inductor compilation is used (custom_ops disabled by default). + One graph for symbolic shape and one graph per size in compile_sizes + are compiled using configurations in inductor_compile_config. + + This setting is ignored if level1.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.level) + factors.append(self.backend) + factors.append(self.custom_ops) + factors.append(self.splitting_ops) + factors.append(self.use_inductor) + factors.append(self.inductor_compile_config) + factors.append(self.inductor_passes) + factors.append(self.pass_config.uuid()) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __repr__(self) -> str: + exclude = { + "static_forward_context": True, + "enabled_custom_ops": True, + "disabled_custom_ops": True, + "compilation_time": True, + "bs_to_padded_graph_size": True, + "traced_files": True, + "inductor_compile_config": { + "post_grad_custom_post_pass": True, + }, + } + + # exclude default attr in pass_config + pass_config_exclude = {} + for attr, default_val in vars(PassConfig()).items(): + if getattr(self.pass_config, attr) == default_val: + pass_config_exclude[attr] = True + if pass_config_exclude: + exclude["pass_config"] = pass_config_exclude + + return TypeAdapter(CompilationConfig).dump_json( + self, + exclude=exclude, # type: ignore[arg-type] + exclude_unset=True).decode() + + __str__ = __repr__ + + def __post_init__(self) -> None: + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2: + # 1. A bug in PyTorch, fixed in 2.7: + # https://github.com/pytorch/pytorch/issues/147924 + # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't + # work with V2. Addressing this will take extra engineering effort + # and it is not yet a priority. 
RFC here: + # https://github.com/vllm-project/vllm/issues/14703 + + if is_torch_equal_or_newer("2.6"): + KEY = 'enable_auto_functionalized_v2' + if KEY not in self.inductor_compile_config: + self.inductor_compile_config[KEY] = False + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) + + if isinstance(self.pass_config, dict): + self.pass_config = PassConfig(**self.pass_config) + + def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]: + if self.level == CompilationLevel.NO_COMPILATION: + raise ValueError("No compilation level is set.") + + from torch._dynamo.backends.registry import list_backends + torch_backends = list_backends(exclude_tags=tuple()) + if self.level in [ + CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE + ]: + if self.backend == "": + return "eager" + if self.backend in torch_backends: + return self.backend + return resolve_obj_by_qualname(self.backend) + + # TODO: pass user-specified backend to piecewise compilation + # merge with the config use_inductor + assert self.level == CompilationLevel.PIECEWISE + + from vllm.compilation.backends import VllmBackend + return VllmBackend(vllm_config) + + def init_with_cudagraph_sizes(self, + cudagraph_capture_sizes: list[int]) -> None: + """To complete the initialization of config, + we need to know the cudagraph sizes.""" + + if self.cudagraph_capture_sizes is None: + self.cudagraph_capture_sizes = cudagraph_capture_sizes + else: + # de-duplicate the sizes provided by the config + dedup_sizes = list(set(self.cudagraph_capture_sizes)) + if len(dedup_sizes) < len(self.cudagraph_capture_sizes): + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + cudagraph_capture_sizes, dedup_sizes) + self.cudagraph_capture_sizes = dedup_sizes + + computed_compile_sizes = [] + if self.compile_sizes is not None: + # de-duplicate the sizes provided by the config + self.compile_sizes = list(set(self.compile_sizes)) + for x in self.compile_sizes: + if isinstance(x, str): + assert x == "cudagraph_capture_sizes", \ + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" + computed_compile_sizes.extend(self.cudagraph_capture_sizes) + else: + assert isinstance(x, int) + computed_compile_sizes.append(x) + self.compile_sizes = computed_compile_sizes # type: ignore + + # sort to make sure cudagraph capture sizes are in descending order + self.cudagraph_capture_sizes.sort(reverse=True) + self.max_capture_size = self.cudagraph_capture_sizes[ + 0] if self.cudagraph_capture_sizes else 0 + + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_capture_size + 1) + ] + for end, start in zip(self.cudagraph_capture_sizes, + self.cudagraph_capture_sizes[1:] + [0]): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end + self.bs_to_padded_graph_size[ + self.max_capture_size] = self.max_capture_size + + def 
set_splitting_ops_for_v1(self): + # NOTE: this function needs to be called + if self.splitting_ops and self.full_cuda_graph: + raise ValueError("full_cuda_graph cannot be used together with " + "splitting_ops, as Full CUDA graph will override " + f"the splitting_ops: {self.splitting_ops}") + + if not self.splitting_ops: + self.splitting_ops = [] if self.full_cuda_graph else [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + "vllm.mamba_mixer2", + ] diff --git a/vllm/config/utils.py b/vllm/config/utils.py new file mode 100644 index 0000000000..98fbeb1fa8 --- /dev/null +++ b/vllm/config/utils.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + from _typeshed import DataclassInstance + + ConfigType = type[DataclassInstance] +else: + ConfigType = type + +ConfigT = TypeVar("ConfigT", bound=ConfigType) + + +def config(cls: ConfigT) -> ConfigT: + """ + A decorator that ensures all fields in a dataclass have default values + and that each field has a docstring. + + If a `ConfigT` is used as a CLI argument itself, the `type` keyword argument + provided by `get_kwargs` will be + `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the + `cli_arg` as a JSON string which gets validated by `pydantic`. + + Config validation is performed by the tools/validate_config.py + script, which is invoked during the pre-commit checks. + """ + return cls diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c0ac3ff631..c9dc99cad2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -193,8 +193,6 @@ Additionally, list elements can be passed individually using `+`: def parse_dataclass(val: str, cls=dataclass_cls) -> Any: try: - if hasattr(cls, "from_cli"): - return cls.from_cli(val) return TypeAdapter(cls).validate_json(val) except ValidationError as e: raise argparse.ArgumentTypeError(repr(e)) from e @@ -455,9 +453,9 @@ class EngineArgs: # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int, dict)): - self.compilation_config = CompilationConfig.from_cli( - str(self.compilation_config)) + if isinstance(self.compilation_config, dict): + self.compilation_config = CompilationConfig( + **self.compilation_config) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() From 311d875614583b7070d16c786c791a3817a8c10a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 8 Aug 2025 19:56:47 -0400 Subject: [PATCH 104/932] Drop flaky test_healthcheck_response_time (#22539) Signed-off-by: Russell Bryant --- .../openai/test_async_tokenization.py | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index ab3c809054..80261597b1 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -2,15 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import contextlib import random -import time from typing import Callable import openai import pytest import pytest_asyncio -import requests from tests.utils import RemoteOpenAIServer @@ -87,54 +84,3 @@ async def test_with_and_without_truncate( responses = await asyncio.gather(*[get_status_code(**b) for b in bodies]) 
assert 500 not in responses - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - ids=["single completion", "multiple completions", "chat"], - argnames=["create_func_gen", "content_body"], - argvalues=[ - (lambda x: x.completions.create, { - "prompt": " ".join(['A'] * 300_000) - }), - (lambda x: x.completions.create, { - "prompt": [" ".join(['A'] * 300_000)] * 2 - }), - (lambda x: x.chat.completions.create, { - "messages": [{ - "role": "user", - "content": " ".join(['A'] * 300_000) - }] - }), - ], -) -async def test_healthcheck_response_time( - server: RemoteOpenAIServer, - client: openai.AsyncOpenAI, - create_func_gen: Callable, - content_body: dict, -): - num_requests = 50 - - create_func = create_func_gen(client) - body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} - - def get_response_time(url): - start_time = time.monotonic() - res = requests.get(url) - end_time = time.monotonic() - assert res.status_code == 200 - return end_time - start_time - - no_load_response_time = get_response_time(server.url_for("health")) - tasks = [ - asyncio.create_task(create_func(**body)) for _ in range(num_requests) - ] - await asyncio.sleep(1) # give the tasks a chance to start running - load_response_time = get_response_time(server.url_for("health")) - - with contextlib.suppress(openai.APIStatusError): - await asyncio.gather(*tasks) - - assert load_response_time < 100 * no_load_response_time - assert load_response_time < 0.1 From 81c57f60a2c77d169dbec021bb58a467edf580f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 9 Aug 2025 08:03:45 +0800 Subject: [PATCH 105/932] [XPU] upgrade torch 2.8 on for XPU (#22300) Signed-off-by: Kunshang Ji --- docker/Dockerfile.xpu | 17 +++++++++++------ requirements/xpu.txt | 11 +++-------- vllm/plugins/__init__.py | 9 --------- vllm/v1/worker/xpu_worker.py | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 7d5a589eb1..65d2e5036b 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -1,9 +1,12 @@ -# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. 
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base +FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base RUN rm /etc/apt/sources.list.d/intel-graphics.list -RUN apt-get update -y && \ +RUN apt clean && apt-get update -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y python3.10 python3.10-distutils && \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ ffmpeg \ @@ -14,11 +17,13 @@ RUN apt-get update -y && \ libgl1 \ lsb-release \ numactl \ - python3 \ - python3-dev \ - python3-pip \ + python3.10-dev \ wget + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 + WORKDIR /workspace/vllm COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt COPY requirements/common.txt /workspace/vllm/requirements/common.txt diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 0d95dc5715..4607c3efdf 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,15 +10,10 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding - -torch==2.7.0+xpu +--extra-index-url=https://download.pytorch.org/whl/xpu +torch==2.8.0+xpu torchaudio torchvision pytorch-triton-xpu ---extra-index-url=https://download.pytorch.org/whl/xpu - -# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu -# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. 
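The requirements change above moves the XPU stack onto the torch 2.8 wheels pulled from the PyTorch XPU index. As a minimal sanity check, not taken from the patch and assuming only the public torch.xpu API, an environment built from the updated requirements/xpu.txt can be verified with:

    import torch

    # Expect a version string such as "2.8.0+xpu" once the new wheels are installed.
    print(torch.__version__)
    # True only when an Intel GPU plus a matching driver/oneAPI runtime are visible.
    print(torch.xpu.is_available())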
-intel-extension-for-pytorch==2.7.10+xpu -oneccl_bind_pt==2.7.0+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +intel-extension-for-pytorch==2.8.10+xpu diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 51c78ddc1a..1a1760df82 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,8 +4,6 @@ import logging from typing import Any, Callable -import torch - import vllm.envs as envs logger = logging.getLogger(__name__) @@ -68,13 +66,6 @@ def load_general_plugins(): return plugins_loaded = True - # some platform-specific configurations - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 2a7e0625b2..134d839252 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -152,7 +152,7 @@ class XPUWorker(Worker): raise RuntimeError( f"Not support device type: {self.device_config.device}") - ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "drmfd") + ENV_CCL_ZE_IPC_EXCHANGE = os.getenv("CCL_ZE_IPC_EXCHANGE", "pidfd") ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", str(self.parallel_config.world_size)) From 35afe1b30b154114dc2ee8329e12f8cf3fe9f576 Mon Sep 17 00:00:00 2001 From: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Date: Fri, 8 Aug 2025 20:04:15 -0400 Subject: [PATCH 106/932] [BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Pradyun Ramadorai Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com> Co-authored-by: Pradyun Ramadorai Co-authored-by: Nicolò Lucchesi --- vllm/v1/core/sched/scheduler.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d39aea1f2d..430085d9c9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -437,14 +437,24 @@ class Scheduler(SchedulerInterface): # The request cannot be scheduled. break + # Handles an edge case when P/D Disaggregation + # is used with Spec Decoding where an + # extra block gets allocated which + # creates a mismatch between the number + # of local and remote blocks. + effective_lookahead_tokens = (0 if request.num_computed_tokens + == 0 else + self.num_lookahead_tokens) + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, num_new_local_computed_tokens, new_computed_blocks, - num_lookahead_tokens=self.num_lookahead_tokens, + num_lookahead_tokens=effective_lookahead_tokens, delay_cache_blocks=load_kv_async, ) + if new_blocks is None: # The request cannot be scheduled. 
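The scheduler change above replaces the unconditional num_lookahead_tokens with effective_lookahead_tokens, so that no speculative lookahead slots are reserved the first time a request is allocated (num_computed_tokens == 0, for example while its prefill KV blocks are still being pulled from the remote prefill worker); this keeps the local and remote block counts aligned. A minimal sketch of that rule in isolation, using an illustrative helper name that does not appear in the patch:

    def effective_lookahead_tokens(num_computed_tokens: int,
                                   num_lookahead_tokens: int) -> int:
        # First allocation for a request (nothing computed yet, e.g. during the
        # P/D transfer of its prefill): reserve no lookahead slots so the local
        # block count matches the remote one. Subsequent allocations reserve
        # the usual speculative-decoding lookahead.
        return 0 if num_computed_tokens == 0 else num_lookahead_tokens

    assert effective_lookahead_tokens(0, 4) == 0      # initial P/D allocation
    assert effective_lookahead_tokens(256, 4) == 4    # later decode steps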
break From 429e4e2d420f7c648d37b7d90430f5df6a7dc61f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 9 Aug 2025 13:17:22 +0800 Subject: [PATCH 107/932] [Bugfix] Fix ModernBert cuda graph capturing in v1 (#21901) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/language/pooling/mteb_utils.py | 5 ++- vllm/model_executor/models/bert.py | 2 +- vllm/model_executor/models/bert_with_rope.py | 46 +++++++++----------- vllm/model_executor/models/modernbert.py | 22 +++++----- vllm/model_executor/models/roberta.py | 6 +-- 5 files changed, 39 insertions(+), 42 deletions(-) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 8c93bbdc98..77aaddb4f5 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -162,7 +162,8 @@ def mteb_test_embed_models(hf_runner, vllm_runner, model_info: EmbedModelInfo, vllm_extra_kwargs=None, - hf_model_callback=None): + hf_model_callback=None, + atol=MTEB_RERANK_TOL): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. @@ -198,7 +199,7 @@ def mteb_test_embed_models(hf_runner, print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) + assert st_main_score == pytest.approx(vllm_main_score, abs=atol) def run_mteb_rerank(cross_encoder, tasks, languages): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 504621c8ab..8f988903f7 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -466,7 +466,7 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 59033cb74a..050f18f16e 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -8,13 +8,15 @@ from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, torch_vllm_outplace_fused_experts) from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -284,15 +286,22 @@ class NomicMoE(nn.Module): hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.router(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w1, - self.w2, - router_logits, - self.top_k, - renormalize=False, - inplace=False, - activation=self.hidden_act, - is_act_and_mul=False) + # FIXME(Isotr0py): This implementation is too tricky, + # we should use FusedMoE instead in the future + # after supporting ungated 
activation for it. + topk_weights, topk_ids, _ = fused_topk(hidden_states, + router_logits, + self.top_k, + renormalize=False) + final_hidden_states = torch_vllm_outplace_fused_experts( + hidden_states=hidden_states, + w1=self.w1, + w2=self.w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=self.hidden_act, + is_act_and_mul=False, + ) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -391,6 +400,7 @@ class BertWithRopeEncoder(nn.Module): return hidden_states +@support_torch_compile class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) @@ -407,7 +417,7 @@ class BertWithRope(nn.Module, SupportsQuant): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -554,20 +564,6 @@ class JinaRobertaModel(BertWithRope): "norm2": "mlp_ln", }) - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - return super().forward(input_ids=input_ids, - positions=position_ids, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - token_type_ids=token_type_ids) - @torch.inference_mode() def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 4967032a24..761fce815e 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -8,6 +8,7 @@ from torch import nn from transformers import ModernBertConfig from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -46,7 +47,7 @@ class ModernBertEmbeddings(nn.Module): input_ids: torch.Tensor, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - if inputs_embeds: + if inputs_embeds is not None: return self.norm(inputs_embeds) else: inputs_embeds = self.tok_embeddings(input_ids) @@ -117,7 +118,7 @@ class ModernBertAttention(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, + position_ids: torch.Tensor, ) -> torch.Tensor: qkv, _ = self.Wqkv(hidden_states) q, k, v = qkv.split([self.all_head_size] * 3, dim=-1) @@ -169,9 +170,9 @@ class ModernBertLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - ): - attn_outputs = self.attn(self.attn_norm(hidden_states), + position_ids: torch.Tensor, + ) -> torch.Tensor: + attn_outputs = self.attn(hidden_states=self.attn_norm(hidden_states), position_ids=position_ids) hidden_states = hidden_states + attn_outputs mlp_output = self.mlp(self.mlp_norm(hidden_states)) @@ -192,13 +193,14 @@ class ModernBertEncoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, + position_ids: torch.Tensor, ) -> torch.Tensor: for i, layer in enumerate(self.layers): hidden_states = layer(hidden_states, position_ids) return hidden_states +@support_torch_compile class 
ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -234,13 +236,11 @@ class ModernBertModel(nn.Module): def forward( self, - input_ids: Optional[torch.LongTensor] = None, - positions: Optional[torch.Tensor] = None, + input_ids: torch.Tensor, + positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, ) -> torch.Tensor: - position_ids = positions if positions is not None else position_ids if inputs_embeds is not None: hidden_states = inputs_embeds else: @@ -249,7 +249,7 @@ class ModernBertModel(nn.Module): outputs = self.encoder_layer( hidden_states=hidden_states, - position_ids=position_ids, + position_ids=positions, ) norm_outputs = self.final_norm(outputs) return norm_outputs diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 77e072c792..61c8faed40 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -105,7 +105,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor, positions: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, @@ -119,8 +119,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel): position_ids=positions, padding_idx=self.padding_idx) - return self.model(input_ids=input_ids, - position_ids=positions, + return self.model(input_ids, + positions, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) From 08b751ba749541259e5450d6371d822fdf769b8a Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 8 Aug 2025 22:21:40 -0700 Subject: [PATCH 108/932] Implicit language-model-only mode via limit-mm-per-prompt (#22299) Signed-off-by: Roger Wang Signed-off-by: Andy Xie Signed-off-by: tjtanaa Signed-off-by: Andrew Sansom Signed-off-by: Zhiyu Cheng Signed-off-by: Shu Wang Signed-off-by: Po-Han Huang Signed-off-by: Shu Wang. 
Signed-off-by: XIn Li Signed-off-by: Junhao Li Signed-off-by: chaunceyjiang Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: zitian.zhao Signed-off-by: zitian zhao Signed-off-by: DarkLight1337 Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Signed-off-by: Linkun Co-authored-by: Ning Xie Co-authored-by: TJian Co-authored-by: Andrew Sansom Co-authored-by: Zhiyu Co-authored-by: Shu Wang Co-authored-by: XIn Li Co-authored-by: Junhao Li Co-authored-by: Chauncey Co-authored-by: Yuxuan Zhang <2448370773@qq.com> Co-authored-by: ZiTian Zhao Co-authored-by: Cyrus Leung Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com> Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Hong Hanh Co-authored-by: youkaichao Co-authored-by: lkchen --- tests/multimodal/test_registry.py | 38 ++++++++++++ vllm/config/__init__.py | 9 --- vllm/model_executor/models/llava.py | 34 +++++++---- vllm/model_executor/models/mistral3.py | 38 +++++++----- vllm/model_executor/models/mllama4.py | 30 ++++++---- .../models/qwen2_5_omni_thinker.py | 33 +++++++--- vllm/model_executor/models/qwen2_5_vl.py | 22 ++++--- vllm/model_executor/models/qwen2_vl.py | 26 +++++--- vllm/model_executor/models/step3_vl.py | 60 ++++++++++++------- vllm/multimodal/registry.py | 39 ++++++++++++ vllm/v1/core/encoder_cache_manager.py | 2 +- vllm/v1/engine/core.py | 3 +- vllm/v1/engine/mm_input_cache.py | 12 ++-- vllm/v1/engine/processor.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 16 ++--- vllm/v1/worker/tpu_model_runner.py | 23 ++++--- 16 files changed, 271 insertions(+), 116 deletions(-) create mode 100644 tests/multimodal/test_registry.py diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py new file mode 100644 index 0000000000..d31e75bc27 --- /dev/null +++ b/tests/multimodal/test_registry.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for MultiModalRegistry.supports_multimodal_inputs and +Qwen2.5-VL visual component loading behavior. 
+""" + +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ..models.utils import build_model_context + + +@pytest.mark.parametrize( + "model_id,limit_mm_per_prompt,expected", + [ + ("Qwen/Qwen2-0.5B-Instruct", {}, False), + ("Qwen/Qwen2.5-VL-3B-Instruct", {}, True), + ("Qwen/Qwen2.5-VL-3B-Instruct", { + "image": 0, + "video": 0 + }, False), + ("Qwen/Qwen2.5-VL-3B-Instruct", { + "image": 0 + }, True), + ], +) +@pytest.mark.core_model +def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): + """Test supports_multimodal_inputs returns correct boolean for various + configs.""" + ctx = build_model_context( + model_id, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + assert MULTIMODAL_REGISTRY.supports_multimodal_inputs( + ctx.model_config) is expected \ No newline at end of file diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index eaed6017cc..69c05b75d3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1695,15 +1695,6 @@ class ModelConfig: return mm_config.mm_processor_cache_gb > 0 - @property - def enable_mm_input_cache(self) -> bool: - """Whether the multi-modal input cache should be enabled.""" - mm_config = self.multimodal_config - if mm_config is None: - return False - - return mm_config.mm_processor_cache_gb > 0 - def get_mm_input_cache_gb(self) -> int: mm_config = self.multimodal_config if mm_config is None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c863ba4064..cfc6ffd99a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -521,18 +521,22 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = init_vision_tower_for_llava( - config, - quant_config, - require_post_norm=False, - prefix=maybe_prefix(prefix, "vision_tower")) - self.multi_modal_projector = LlavaMultiModalProjector( - vision_hidden_size=config.vision_config.hidden_size, - text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act, - multimodal_projector_bias=config.multimodal_projector_bias, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_tower = init_vision_tower_for_llava( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + self.multi_modal_projector = LlavaMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_tower = None + self.multi_modal_projector = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -756,7 +760,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.vision_tower is None and self.multi_modal_projector is None: + skip_prefixes.extend(["vision_tower.", "multi_modal_projector."]) + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 88c3823eaa..9e29a96c6e 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -428,20 +428,24 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = init_vision_tower_for_llava( - config, - quant_config, - require_post_norm=False, - prefix=maybe_prefix(prefix, "vision_tower")) - self.multi_modal_projector = Mistral3MultiModalProjector( - vision_hidden_size=config.vision_config.hidden_size, - text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act, - spatial_merge_size=config.spatial_merge_size, - patch_size=config.vision_config.patch_size, - multimodal_projector_bias=config.multimodal_projector_bias, - quant_config=quant_config, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_tower = init_vision_tower_for_llava( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + self.multi_modal_projector = Mistral3MultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + spatial_merge_size=config.spatial_merge_size, + patch_size=config.vision_config.patch_size, + multimodal_projector_bias=config.multimodal_projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_tower = None + self.multi_modal_projector = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -611,7 +615,11 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.vision_tower is None and self.multi_modal_projector is None: + skip_prefixes = ["vision_tower.", "multi_modal_projector."] + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e73dc0c2be..b405dfca6d 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -737,16 +737,20 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.vision_model = Llama4VisionModel( - config.vision_config, - None, - prefix=maybe_prefix(prefix, "vision_model"), - use_data_parallel=self.use_data_parallel, - ) - self.multi_modal_projector = Llama4MultiModalProjector( - self.config, - None, - prefix=maybe_prefix(prefix, "multi_modal_projector")) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_model = Llama4VisionModel( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel, + ) + self.multi_modal_projector = Llama4MultiModalProjector( + self.config, + None, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + else: + self.vision_model = None + self.multi_modal_projector = None self.language_model = initialize_model( vllm_config=vllm_config.with_hf_config(config.text_config, ["LlamaForCausalLM"]), @@ -783,6 +787,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_input( self, image_input: Llama4ImagePatchInputs) -> MultiModalEmbeddings: + + assert self.vision_model and self.multi_modal_projector flat_data = image_input["flat_data"] patches_per_image = image_input["patches_per_image"].tolist() @@ 
-1048,6 +1054,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, language_model_weights, other_weights = ( self._separate_and_rename_weights(weights)) + # Skip loading vision model and projector if they're not initialized. + if self.vision_model is None and self.multi_modal_projector is None: + other_weights = [] + # Handle expert scale parameters regular_weights, expert_scale_weights, updated_params_from_experts = ( self._handle_expert_scale_broadcasting(language_model_weights, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index a3af541d20..e95295c318 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -722,13 +722,24 @@ class Qwen2_5OmniThinkerForConditionalGeneration( "exactly same result as the transformers implementation " "in the audio tower part.") - self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config) - self.visual = Qwen2_5_VisionTransformer( - vision_config=thinker_config.vision_config, - norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("audio"): + self.audio_tower = Qwen2_5OmniAudioEncoder( + thinker_config.audio_config) + else: + self.audio_tower = None + + if multimodal_config.get_limit_per_prompt( + "image") or multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2_5_VisionTransformer( + vision_config=thinker_config.vision_config, + norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", + 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None + self.quant_config = quant_config self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -886,9 +897,15 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = ["talker.", "token2wav."] + if self.audio_tower is None: + skip_prefixes.extend(["audio_tower."]) + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader( self, - skip_prefixes=["talker.", "token2wav."], + skip_prefixes=skip_prefixes, ) loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 79c5c77f6d..6bea180ffe 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -843,12 +843,17 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.visual = Qwen2_5_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(self.quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("image") or \ + multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config( + self.quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -1152,7 +1157,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, 
SupportsMultiModal, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 633f8598e8..f2d438b385 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1049,12 +1049,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.visual = Qwen2VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if multimodal_config.get_limit_per_prompt("image") or \ + multimodal_config.get_limit_per_prompt("video"): + self.visual = Qwen2VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = None self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -1350,7 +1354,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: @@ -1445,5 +1452,8 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + skip_prefixes = [] + if self.visual is None: + skip_prefixes.extend(["visual."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 363c12a4bf..41dba312cb 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -837,27 +837,35 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.vision_model = Step3VisionTransformer(config.vision_config, - None, - prefix=maybe_prefix( - prefix, "vision_model")) - self.vit_downsampler = nn.Conv2d( - config.vision_config.hidden_size, - config.vision_config.output_hidden_size, - kernel_size=2, - stride=config.understand_projector_stride) - self.vit_downsampler2 = nn.Conv2d( - config.vision_config.output_hidden_size, - config.vision_config.output_hidden_size * 2, - kernel_size=3, - stride=2, - padding=1, - ) - self.vit_large_projector = nn.Linear( - config.vision_config.output_hidden_size * 2, - config.hidden_size, - bias=config.projector_bias, - ) + if multimodal_config.get_limit_per_prompt("image"): + self.vision_model = Step3VisionTransformer(config.vision_config, + None, + prefix=maybe_prefix( + prefix, + "vision_model")) + self.vit_downsampler = nn.Conv2d( + 
config.vision_config.hidden_size, + config.vision_config.output_hidden_size, + kernel_size=2, + stride=config.understand_projector_stride) + self.vit_downsampler2 = nn.Conv2d( + config.vision_config.output_hidden_size, + config.vision_config.output_hidden_size * 2, + kernel_size=3, + stride=2, + padding=1, + ) + self.vit_large_projector = nn.Linear( + config.vision_config.output_hidden_size * 2, + config.hidden_size, + bias=config.projector_bias, + ) + else: + self.vision_model = None + self.vit_downsampler = None + self.vit_downsampler2 = None + self.vit_large_projector = None + self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.text_config, @@ -1046,7 +1054,15 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.sample(logits, sampling_metadata) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) + + skip_prefixes = [] + if self.vision_model is None and self.vit_large_projector is None: + skip_prefixes = [ + "vision_model.", "vit_downsampler.", "vit_downsampler2.", + "vit_large_projector." + ] + + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loaded_weights diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 565d54e1a2..a101f2a55f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -115,6 +115,45 @@ class MultiModalRegistry: return True # Success + def enable_mm_input_cache(self, model_config: "ModelConfig") -> bool: + """Whether the multi-modal input cache should be enabled. + NOTE: This is put under MultiModalRegistry on purpose to respect + text-only mode for multimodal models. + """ + + if not self.supports_multimodal_inputs(model_config): + return False + + mm_config = model_config.get_multimodal_config() + + return mm_config.mm_processor_cache_gb > 0 + + def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: + """ + Checks if the model supports multimodal inputs. + Returns True if the model is multimodal with any non-zero supported + modalities, otherwise returns False, effectively running in + text-only mode. + """ + if not model_config.is_multimodal_model: + return False + + processor = self.create_processor(model_config, disable_cache=False) + supported_modalities = processor.info.get_supported_mm_limits() + + mm_config = model_config.get_multimodal_config() + + # Check if all supported modalities have limit == 0 + if all( + mm_config.get_limit_per_prompt(modality) == 0 + for modality in supported_modalities): + logger.info_once( + "All limits of multimodal modalities supported by the model " + "are set to 0, running in text-only mode.") + return False + + return True + def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 67ea3b007e..faf5c132f8 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -189,7 +189,7 @@ def compute_encoder_budget( in the input sequence. """ - if not model_config.is_multimodal_model: + if not mm_registry.supports_multimodal_inputs(model_config): return 0, 0 # TODO: handle encoder-decoder models once we support them. 
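Taken together, the model changes above (LLaVA, Mistral-3, Llama-4, Qwen2/2.5-VL, the Omni thinker and Step3) apply one pattern: if every per-prompt limit for a module's modality is 0, the tower and projector are never constructed, and AutoWeightsLoader is told to skip the corresponding checkpoint prefixes so those weights are neither loaded nor held in GPU memory. A condensed, runnable toy of that pattern in plain PyTorch (the class and method names below are illustrative, not vLLM APIs):

import torch
from torch import nn


class ToyVLModel(nn.Module):
    """Toy stand-in for the conditional-initialization pattern above."""

    def __init__(self, image_limit: int):
        super().__init__()
        # Only build the vision tower when images are actually allowed.
        self.vision_tower = nn.Linear(16, 8) if image_limit else None
        self.language_model = nn.Linear(8, 8)

    def load_filtered(self, state_dict: dict[str, torch.Tensor]) -> None:
        # Rough analogue of AutoWeightsLoader(self, skip_prefixes=...): drop
        # keys for modules that were never constructed instead of failing.
        skip_prefixes = [] if self.vision_tower is not None else ["vision_tower."]
        kept = {
            name: tensor
            for name, tensor in state_dict.items()
            if not any(name.startswith(p) for p in skip_prefixes)
        }
        self.load_state_dict(kept, strict=False)


full_checkpoint = ToyVLModel(image_limit=1).state_dict()
text_only = ToyVLModel(image_limit=0)
text_only.load_filtered(full_checkpoint)  # vision_tower.* weights are ignored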
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 78b8fe4ea6..f92a3e43da 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -21,6 +21,7 @@ from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -125,7 +126,7 @@ class EngineCore: ) self.mm_input_cache_server = MultiModalInputCacheServer( - vllm_config.model_config) + vllm_config.model_config, MULTIMODAL_REGISTRY) # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. This enables us to asynchronously diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 279c9f0007..0532cda03d 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -3,7 +3,7 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Optional -from vllm.multimodal import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs, MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata from vllm.utils import is_list_of @@ -46,10 +46,11 @@ if TYPE_CHECKING: class MultiModalInputCacheClient: """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - def __init__(self, model_config: "ModelConfig") -> None: + def __init__(self, model_config: "ModelConfig", + mm_registry: MultiModalRegistry) -> None: super().__init__() - self.enabled = model_config.enable_mm_input_cache + self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), MultiModalCacheItemMetadata, @@ -85,10 +86,11 @@ class MultiModalInputCacheClient: class MultiModalInputCacheServer: """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" - def __init__(self, model_config: "ModelConfig") -> None: + def __init__(self, model_config: "ModelConfig", + mm_registry: MultiModalRegistry) -> None: super().__init__() - self.enabled = model_config.enable_mm_input_cache + self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), MultiModalKwargs, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6e37ebeb87..b9419142ca 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -51,7 +51,7 @@ class Processor: mm_registry) self.mm_input_cache_client = MultiModalInputCacheClient( - self.model_config) + self.model_config, mm_registry) @property def mm_registry(self): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 08b253dcdb..48ff50fd6b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -129,7 +129,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - self.is_multimodal_model = model_config.is_multimodal_model self.is_pooling_model = model_config.pooler_config is not None self.is_encoder_only_model = False self.is_multimodal_raw_input_supported = ( @@ -149,6 +148,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Multi-modal data support 
self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + model_config) # Sampler self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) @@ -330,7 +331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry, max_model_len=self.max_model_len, max_num_reqs=self.max_num_reqs, - ) if self.is_multimodal_model else None) + ) if self.supports_mm_inputs \ + else None) self.reorder_batch_threshold: Optional[int] = None @@ -1479,14 +1481,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order - if self.is_multimodal_model: + if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) else: mm_embeds = [] - if self.is_multimodal_model and get_pp_group().is_first_rank: + if self.supports_mm_inputs and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. @@ -1817,7 +1819,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: target_hidden_states = hidden_states[token_indices] mm_embeds = None - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_embeds = self._gather_mm_embeddings(scheduler_output, shift_computed_tokens=1) @@ -2209,7 +2211,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): - if self.is_multimodal_model: + if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] model_mm_kwargs = self._dummy_mm_kwargs(num_reqs) @@ -2417,7 +2419,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_budget = self.mm_budget assert mm_budget is not None diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 81252f9b60..442c0ea068 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -157,7 +157,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cache_config.cache_dtype] self._hidden_states_dtype = self.dtype - self.is_multimodal_model = model_config.is_multimodal_model self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len @@ -193,6 +192,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( + model_config) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." 
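With the GPU and TPU runners keyed off supports_mm_inputs rather than is_multimodal_model, a multimodal checkpoint whose limits are all 0 now skips encoder profiling, the encoder cache budget and the embedding-merge path entirely. A sketch of how this would be exercised through the Python API (model name and limits are illustrative, and a suitable GPU is assumed):

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 0, "video": 0},  # text-only mode
)
out = llm.generate("The capital of France is")
print(out[0].outputs[0].text)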
@@ -293,7 +294,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry, max_model_len=self.max_model_len, max_num_reqs=self.max_num_reqs, - ) if self.is_multimodal_model else None) + ) if self.supports_mm_inputs else None) if not self.use_spmd: self.sample_from_logits_func = torch.compile( @@ -947,7 +948,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_model_inputs(self, input_ids: torch.Tensor, mm_embeds: list[torch.Tensor]): - if self.is_multimodal_model: + if self.supports_mm_inputs: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. @@ -979,7 +980,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.kv_connector_no_forward(scheduler_output, self.vllm_config) - if self.is_multimodal_model: + if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embeds = self._gather_mm_embeddings(scheduler_output) @@ -1230,7 +1231,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): @torch.no_grad() def _dummy_run(self, num_tokens: int, num_reqs: int, num_blocks: int) -> None: - if self.is_multimodal_model: + if self.supports_mm_inputs: input_ids = None inputs_embeds = torch.zeros((num_tokens, self.hidden_size), dtype=self.dtype, @@ -1271,7 +1272,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): _num_slices_per_kv_cache_update_block, ) - if self.is_multimodal_model: + if self.supports_mm_inputs: torch._dynamo.mark_dynamic(inputs_embeds, 0) else: torch._dynamo.mark_dynamic(input_ids, 0) @@ -1305,7 +1306,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): xm.mark_step() # Captures metadata updates def _precompile_mm_encoder(self) -> None: - if not self.is_multimodal_model: + if not self.supports_mm_inputs: return # Pre-compile MM encoder for all supported data modalities. @@ -1527,7 +1528,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens: int, ) -> None: # Profile with multimodal encoder & encoder cache. - if self.is_multimodal_model: + if self.supports_mm_inputs: mm_budget = self.mm_budget assert mm_budget is not None @@ -1684,7 +1685,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): get_kv_transfer_group().set_host_xfer_buffer_ops(copy_kv_blocks) def reset_dynamo_cache(self): - if self.is_multimodal_model: + + # NOTE: We check `is_multimodal_model` instead of `supports_mm_inputs` + # since the compiled model object of the language backbone of a + # multimodal model needs to be extracted via `get_language_model`. 
+ if self.model_config.is_multimodal_model: compiled_model = self.model.get_language_model().model else: compiled_model = self.model.model From 23472ff51cdf25c2f9c9bf9afa50a8d3cc6cc1d8 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 8 Aug 2025 23:04:19 -0700 Subject: [PATCH 109/932] [Doc] Add usage of implicit text-only mode (#22561) Signed-off-by: Roger Wang Co-authored-by: Flora Feng <4florafeng@gmail.com> --- docs/models/supported_models.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b79650444a..afabfccb55 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -583,6 +583,9 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp **This is no longer required if you are using vLLM V1.** +!!! tip + For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. + !!! note vLLM currently only supports adding LoRA to the language backbone of multimodal models. From 8a0ffd6285f6a0d8137d9363f448cef78ce97712 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 08:05:32 +0200 Subject: [PATCH 110/932] Remove mamba_ssm from vLLM requirements; install inside test container using `--no-build-isolation` (#22541) Signed-off-by: Thomas Parnell --- .buildkite/test-pipeline.yaml | 8 ++++---- docs/contributing/ci/update_pytorch_version.md | 13 ------------- requirements/test.in | 5 ++--- requirements/test.txt | 13 +------------ tests/models/language/generation/test_hybrid.py | 16 +++++++++------- tests/models/registry.py | 16 ++++++++++------ 6 files changed, 26 insertions(+), 45 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e139c6b305..221888edb3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -535,8 +535,6 @@ steps: - vllm/ - tests/models/language commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip freeze | grep -E 'torch' - pytest -v -s models/language -m core_model @@ -547,8 +545,10 @@ steps: - vllm/ - tests/models/language/generation commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m hybrid_model - label: Language Models Test (Extended Generation) # 1hr20min diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 3a6026d450..7ef22d6f8c 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -131,19 +131,6 @@ MAX_JOBS=16 uv pip install --system \ --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30" ``` -### Mamba - -```bash -uv pip install --system \ - --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5" -``` - -### causal-conv1d - -```bash -uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' -``` - ## Update all the different vLLM platforms Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable diff --git a/requirements/test.in b/requirements/test.in index 1e0cab80a2..ca22fd1551 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -10,7 +10,7 @@ pytest-timeout # testing utils backoff # required for phi4mm test blobfile # required for kimi-vl test -einops # required for MPT, qwen-vl and Mamba +einops # required for MPT, qwen-vl httpx librosa # required for audio tests vector_quantize_pytorch # required for minicpmo_26 test @@ -26,7 +26,6 @@ torch==2.7.1 torchaudio==2.7.1 torchvision==0.22.1 transformers_stream_generator # required for qwen-vl test -mamba_ssm==2.2.5 # required for plamo2 test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.2 # required for voxtral test num2words # required for smolvlm test @@ -53,4 +52,4 @@ runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 -terratorch==1.1rc2 # required for PrithviMAE test \ No newline at end of file +terratorch==1.1rc2 # required for PrithviMAE test diff --git a/requirements/test.txt b/requirements/test.txt index 324f8153b2..377eeb58c4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -178,7 +178,6 @@ einops==0.8.1 # via # -r requirements/test.in # encodec - # mamba-ssm # terratorch # torchgeo # vector-quantize-pytorch @@ -417,8 +416,6 @@ lxml==5.3.0 # sacrebleu mako==1.3.10 # via alembic -mamba-ssm==2.2.5 - # via -r requirements/test.in markdown==3.8.2 # via mlflow markdown-it-py==3.0.0 @@ -475,8 +472,6 @@ networkx==3.2.1 # via # scikit-image # torch -ninja==1.11.1.3 - # via mamba-ssm nltk==3.9.1 # via rouge-score num2words==0.5.14 @@ -629,7 +624,6 @@ packaging==24.2 # lazy-loader # lightning # lightning-utilities - # mamba-ssm # matplotlib # mlflow-skinny # peft @@ -973,7 +967,6 @@ sentencepiece==0.2.0 setuptools==77.0.3 # via # lightning-utilities - # mamba-ssm # pytablewriter # torch # triton @@ -1085,7 +1078,6 @@ torch==2.7.1+cu128 # lightly # lightning # lm-eval - # mamba-ssm # mteb # open-clip-torch # peft @@ -1152,16 +1144,13 @@ transformers==4.55.0 # -r requirements/test.in # genai-perf # lm-eval - # mamba-ssm # peft # sentence-transformers # transformers-stream-generator transformers-stream-generator==0.0.5 # 
via -r requirements/test.in triton==3.3.1 - # via - # mamba-ssm - # torch + # via torch tritonclient==2.51.0 # via # -r requirements/test.in diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 67ba2f2559..8c3e1f5c2b 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -25,10 +25,8 @@ SSM_MODELS = [ HYBRID_MODELS = [ "ai21labs/Jamba-tiny-dev", - # NOTE: Running Plamo2 in transformers implementation requires to install - # causal-conv1d package, which is not listed as a test dependency as it's - # not compatible with pip-compile. - "pfnet/plamo-2-1b", + # skipping until vLLM implementation issues are resolved + # "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", "ibm-ai-platform/Bamba-9B-v1", @@ -83,12 +81,16 @@ def test_models( try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") + hf_version_check = model_info.check_transformers_version( + on_fail="return") except ValueError: - pass + hf_version_check = None + + if hf_version_check is not None: + print(f"Skipping transformers comparison because: {hf_version_check}") with hf_runner(model) as hf_model: - if model not in HF_UNSUPPORTED_MODELS: + if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) else: diff --git a/tests/models/registry.py b/tests/models/registry.py index b1952ce9c2..2bb06b7d19 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -79,17 +79,17 @@ class _HfExamplesInfo: def check_transformers_version( self, *, - on_fail: Literal["error", "skip"], + on_fail: Literal["error", "skip", "return"], check_min_version: bool = True, check_max_version: bool = True, - ) -> None: + ) -> Optional[str]: """ If the installed transformers version does not meet the requirements, perform the given action. """ if (self.min_transformers_version is None and self.max_transformers_version is None): - return + return None current_version = TRANSFORMERS_VERSION cur_base_version = Version(current_version).base_version @@ -105,16 +105,18 @@ class _HfExamplesInfo: and Version(cur_base_version) > Version(max_version)): msg += f"<={max_version}` is required to run this model." 
else: - return + return None if self.transformers_version_reason: msg += f" Reason: {self.transformers_version_reason}" if on_fail == "error": raise RuntimeError(msg) - else: + elif on_fail == "skip": pytest.skip(msg) + return msg + def check_available_online( self, *, @@ -148,7 +150,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), - "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B", + "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", + min_transformers_version="4.55.1", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), @@ -223,6 +226,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", + min_transformers_version="4.55.1", extras={ "tiny": "ai21labs/Jamba-tiny-dev", "random": "ai21labs/Jamba-tiny-random", # noqa: E501 From 3157aebb63a2e121da6de943754dc95dffd14caa Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sat, 9 Aug 2025 02:07:48 -0400 Subject: [PATCH 111/932] [Log] Add Warning for Deprecation of DeepGEMM old version (#22194) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 0edfb01cde..174287b44b 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -14,6 +14,7 @@ from typing import Any, Callable, NoReturn import torch import vllm.envs as envs +from vllm.logger import logger from vllm.platforms import current_platform from vllm.utils import cdiv, has_deep_gemm @@ -57,6 +58,14 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: if hasattr(module, new): return getattr(module, new) if hasattr(module, old): + # TODO(wentao): deprecate old symbol in the future. + logger.warning_once( + "Found legacy DeepGEMM symbol `%s`. Please upgrade the `deep_gemm` " + "package so that `%s` is available. 
Support for the legacy symbol " + "will be removed in a future vLLM release.", + old, + new, + ) return getattr(module, old) return None From 6ade99eafa373f5c88eb6b8956daa4c217aa7cda Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 08:08:48 +0200 Subject: [PATCH 112/932] [V1] [Hybrid] Support Minimax-Text-01 in V1 (#22151) Signed-off-by: Thomas Parnell --- vllm/model_executor/layers/lightning_attn.py | 2 +- .../layers/mamba/mamba_utils.py | 11 + vllm/model_executor/models/minimax_text_01.py | 192 ++++++++++++++---- vllm/v1/attention/backends/linear_attn.py | 67 ++++++ vllm/v1/attention/backends/mamba_selectors.py | 4 +- 5 files changed, 234 insertions(+), 42 deletions(-) create mode 100644 vllm/v1/attention/backends/linear_attn.py diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 978086d190..8ffc700ca5 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -532,7 +532,7 @@ def _linear_attn_decode_kernel( pid_d = tl.program_id(2) # dimension block index # Load slot index for the current batch - slot_id = tl.load(slot_idx + pid_b) + slot_id = tl.load(slot_idx + pid_b).to(tl.int64) # Skip if slot_id is -1 (padding) if slot_id == -1: diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 42c815b08f..ad14017912 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -5,6 +5,17 @@ from vllm.distributed import divide class MambaStateShapeCalculator: + @classmethod + def linear_attention_state_shape( + cls, + num_heads: int, + tp_size: int, + head_dim: int, + ) -> tuple[tuple[int, int, int], ...]: + + state_shape = (num_heads // tp_size, head_dim, head_dim) + return (state_shape, ) + @classmethod def mamba1_state_shape( cls, diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index f2773af490..1f9f7f60ca 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -14,8 +14,9 @@ from einops import rearrange from torch import nn from transformers.configuration_utils import PretrainedConfig +from vllm import envs from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed.communication_op import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_rank, @@ -33,6 +34,9 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateShapeCalculator) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -41,8 +45,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata -from .interfaces import HasInnerState, IsHybrid, 
SupportsV0Only +from .interfaces import HasInnerState, IsHybrid from .minimax_cache import MinimaxCacheManager, MinimaxCacheParams from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers @@ -327,7 +332,17 @@ class MiniMaxText01LinearKernel: return rearrange(output.squeeze(0), "h n d -> n (h d)") -class MiniMaxText01LinearAttention(nn.Module): +class MiniMaxText01LinearAttention(nn.Module, MambaBase): + + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=self.num_heads, + tp_size=self.tp_size, + head_dim=self.head_dim) def __init__( self, @@ -359,6 +374,7 @@ class MiniMaxText01LinearAttention(nn.Module): self.tp_heads = self.total_num_heads // self.tp_size self.qkv_size = self.num_heads * self.head_dim self.tp_hidden = self.head_dim * self.tp_heads + self.prefix = prefix self.qkv_proj = ColumnParallelLinear( hidden_size, @@ -397,6 +413,12 @@ class MiniMaxText01LinearAttention(nn.Module): self.tp_heads:(self.tp_rank + 1) * self.tp_heads].contiguous() + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + @staticmethod def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: @@ -434,13 +456,14 @@ class MiniMaxText01LinearAttention(nn.Module): break if _prefill_idx >= len(state_indices_tensor): break - _start = attn_metadata.query_start_loc[_prefill_idx] - _end = attn_metadata.query_start_loc[_prefill_idx + 1] - slot_id = state_indices_tensor[_prefill_idx] + # prefills are packed at end of batch in V1 + offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0 + _start = attn_metadata.query_start_loc[offset + _prefill_idx] + _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] + slot_id = state_indices_tensor[offset + _prefill_idx] qs = q[_start:_end].transpose(0, 1).contiguous() ks = k[_start:_end].transpose(0, 1).contiguous() vs = v[_start:_end].transpose(0, 1).contiguous() - slot_id = state_indices_tensor[_prefill_idx] slice_layer_cache = kv_cache[slot_id, ...] 
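            # Illustrative example of the V1 layout assumed above (numbers are
            # made up): with 3 single-token decodes followed by prefills of
            # lengths 4 and 6,
            #     query_start_loc = [0, 1, 2, 3, 7, 13]
            #     num_decodes = num_decode_tokens = 3, num_prefills = 2
            # so prefill i spans query_start_loc[offset + i] to
            # query_start_loc[offset + i + 1], e.g. tokens 3..7 for the first
            # prefill. On V0 the offset is 0 because prefills come first.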
out_slice = MiniMaxText01LinearKernel.jit_linear_forward_prefix( @@ -453,9 +476,13 @@ class MiniMaxText01LinearAttention(nn.Module): layer_idx=self.layer_idx) hidden.append(out_slice.contiguous()) if attn_metadata.num_decode_tokens > 0: - hidden.append( - self._decode_infer(q, k, v, kv_cache, state_indices_tensor, - attn_metadata)) + hidden_decode = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + if envs.VLLM_USE_V1: + hidden.insert(0, hidden_decode) + else: + hidden.append(hidden_decode) if not hidden: return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype) @@ -465,11 +492,17 @@ class MiniMaxText01LinearAttention(nn.Module): def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata): - q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() - slot_id = state_indices_tensor[getattr(attn_metadata, "num_prefills", 0 - ):] + if not envs.VLLM_USE_V1: + q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous() + num_prefills = getattr(attn_metadata, "num_prefills", 0) + slot_id = state_indices_tensor[num_prefills:] + else: + q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous() + slot_id = state_indices_tensor[:attn_metadata.num_decodes] hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope, slot_id, 32) return hidden @@ -483,17 +516,49 @@ class MiniMaxText01LinearAttention(nn.Module): q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1) forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - kv_cache = kv_caches.minimax_cache - state_indices_tensor = kv_caches.state_indices_tensor + if envs.VLLM_USE_V1: + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, LinearAttentionMetadata) + kv_cache = self.kv_cache[forward_context.virtual_engine][0] + state_indices_tensor = attn_metadata.state_indices_tensor + + num_prefills = getattr(attn_metadata, "num_prefills", 0) + if num_prefills > 0: + num_decode_tokens = getattr(attn_metadata, + "num_decode_tokens", 0) + for prefill_idx in range(num_prefills): + q_start = attn_metadata.query_start_loc[ + num_decode_tokens + prefill_idx] + q_end = attn_metadata.query_start_loc[num_decode_tokens + + prefill_idx + + 1] + query_len = q_end - q_start + context_len = attn_metadata.seq_lens[ + num_decode_tokens + prefill_idx] - query_len + if context_len == 0: + block_to_clear = state_indices_tensor[ + num_decode_tokens + prefill_idx] + kv_cache[block_to_clear, ...] 
= 0 + else: + kv_cache = kv_caches.minimax_cache + state_indices_tensor = kv_caches.state_indices_tensor decode_only = getattr(attn_metadata, "num_prefills", 0) == 0 - if not decode_only: - hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, - state_indices_tensor, - attn_metadata) + if attn_metadata is None: + hidden = torch.empty((q.shape[0], q.shape[1] * q.shape[2]), + device=q.device, + dtype=q.dtype) else: - hidden = self._decode_infer(q, k, v, kv_cache, - state_indices_tensor, attn_metadata) + if not decode_only: + hidden = self._prefill_and_mix_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) + else: + hidden = self._decode_infer(q, k, v, kv_cache, + state_indices_tensor, + attn_metadata) hidden = self.norm._forward(hidden) gate, _ = self.output_gate(hidden_states) @@ -541,6 +606,7 @@ class MiniMaxText01Attention(nn.Module): self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.sliding_window = sliding_window + self.prefix = prefix self.qkv_proj = QKVParallelLinear( hidden_size, @@ -575,7 +641,12 @@ class MiniMaxText01Attention(nn.Module): attn_metadata = forward_context.attn_metadata qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = attn_metadata.rotary_emb(positions, q, k) + if envs.VLLM_USE_V1: + if attn_metadata is not None: + q, k = attn_metadata[f"{self.prefix}.attn"].rotary_emb( + positions, q, k) + else: + q, k = attn_metadata.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output @@ -595,6 +666,7 @@ class MiniMaxText01DecoderLayer(nn.Module): ) -> None: self._ilayer = layer_id self._irank = get_tensor_model_parallel_rank() + self.prefix = prefix super().__init__() self.hidden_size = config.hidden_size @@ -876,8 +948,9 @@ class MiniMaxText01Model(nn.Module): self._dtype = _dummy.dtype del _dummy - self.minimax_cache = MinimaxCacheManager(dtype=torch.float32, - cache_shape=self.cache_shape) + if not envs.VLLM_USE_V1: + self.minimax_cache = MinimaxCacheManager( + dtype=torch.float32, cache_shape=self.cache_shape) rope_theta = getattr(config, "rope_theta", 10000) head_dim = getattr(config, "head_dim", None) @@ -944,23 +1017,27 @@ class MiniMaxText01Model(nn.Module): **kwargs) -> Union[torch.Tensor, IntermediateTensors]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - if attn_metadata is None: + if not envs.VLLM_USE_V1 and attn_metadata is None: return None if "request_ids_to_seq_ids" not in kwargs: kwargs["request_ids_to_seq_ids"] = {} if "finished_requests_ids" not in kwargs: kwargs["finished_requests_ids"] = [] - ( - minimax_cache_tensors, - state_indices_tensor, - ) = self.minimax_cache.current_run_tensors(**kwargs) - if getattr(attn_metadata, "num_prefills", 0) > 0: - self._clear_prefill_cache(attn_metadata, minimax_cache_tensors, - **kwargs) + if not envs.VLLM_USE_V1: + ( + minimax_cache_tensors, + state_indices_tensor, + ) = self.minimax_cache.current_run_tensors(**kwargs) + if getattr(attn_metadata, "num_prefills", 0) > 0: + self._clear_prefill_cache(attn_metadata, minimax_cache_tensors, + **kwargs) + + minimax_cache_params = MinimaxCacheParams(minimax_cache_tensors, + state_indices_tensor) + else: + minimax_cache_params = None - minimax_cache_params = MinimaxCacheParams(minimax_cache_tensors, - state_indices_tensor) if get_pp_group().is_first_rank: if inputs_embeds is None: hidden_states = self.embed_scale * self.embed_tokens(input_ids) @@ -973,11 +1050,22 @@ class 
MiniMaxText01Model(nn.Module): residual = intermediate_tensors["residual"] minimax_cache_index = 0 - attn_metadata.rotary_emb = self.rotary_emb + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] + if attn_metadata is not None: + # TODO (tdoublep): this whole thing with the rotary_emb is + # weird. we shouldn't be passing it via attn_metadata imo. + if envs.VLLM_USE_V1: + if isinstance(layer.self_attn, MiniMaxText01Attention): + attn_metadata[layer.prefix + + ".attn"].rotary_emb = self.rotary_emb + else: + attn_metadata.rotary_emb = self.rotary_emb + _caches = None - if isinstance(layer.self_attn, MiniMaxText01LinearAttention): + if not envs.VLLM_USE_V1 and isinstance( + layer.self_attn, MiniMaxText01LinearAttention): current_state_layer = minimax_cache_index _caches = minimax_cache_params.at_layer_idx( current_state_layer) @@ -1002,8 +1090,7 @@ class MiniMaxText01Model(nn.Module): return hidden_states -class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, - SupportsV0Only): +class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -1321,3 +1408,28 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, load_basic_weight(name, loaded_weight, self) return loaded_params + + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + use_v1: bool = True, + ) -> tuple[tuple[int, ...], ...]: + """Calculate shape for MiniMaxText01LinearAttention cache. + + Args: + vllm_config: vLLM config + use_v1: Get shapes for V1 (or V0) + + Returns: + Tuple containing: + - state_shape: Shape of the cache + """ + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=hf_config.num_attention_heads, + tp_size=parallel_config.tensor_parallel_size, + head_dim=hf_config.head_dim, + ) diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py new file mode 100644 index 0000000000..f08b6d7f17 --- /dev/null +++ b/vllm/v1/attention/backends/linear_attn.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import ClassVar + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + + +class LinearAttentionBackend(AttentionBackend): + + @staticmethod + def get_builder_cls() -> type["LinearAttentionMetadataBuilder"]: + return LinearAttentionMetadataBuilder + + +@dataclass +class LinearAttentionMetadata: + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int + query_start_loc: torch.Tensor + seq_lens: torch.Tensor + + state_indices_tensor: torch.Tensor # shape: [batch,] + + +class LinearAttentionMetadataBuilder( + AttentionMetadataBuilder[LinearAttentionMetadata]): + + reorder_batch_threshold: ClassVar[int] = 1 + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + + def build(self, + common_prefix_len: int, + common_attn_metadata: 
CommonAttentionMetadata, + fast_build: bool = False) -> LinearAttentionMetadata: + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=1)) + + attn_metadata = LinearAttentionMetadata( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + query_start_loc=query_start_loc, + seq_lens=seq_lens, + state_indices_tensor=state_indices_tensor, + ) + return attn_metadata diff --git a/vllm/v1/attention/backends/mamba_selectors.py b/vllm/v1/attention/backends/mamba_selectors.py index f56f2fb7bf..852e0dfe1b 100644 --- a/vllm/v1/attention/backends/mamba_selectors.py +++ b/vllm/v1/attention/backends/mamba_selectors.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.abstract import AttentionBackend +from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend @@ -8,9 +9,10 @@ from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend def get_mamba_attn_backend(mamba_type: str) -> type[AttentionBackend]: if mamba_type == "mamba1": return Mamba1AttentionBackend - if mamba_type == "mamba2": return Mamba2AttentionBackend + if mamba_type == "linear_attention": + return LinearAttentionBackend raise NotImplementedError(f"Mamba Attention type {mamba_type} is not " "supported yet.") From 7ad7adb67f1350b6e9f7cfdd7aacf38eed093bb1 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Sat, 9 Aug 2025 09:09:51 +0300 Subject: [PATCH 113/932] v1: Pass KVConnectorOutput to scheduler-side (#22157) Signed-off-by: Or Ozeri --- .../distributed/kv_transfer/kv_connector/v1/base.py | 13 +++++++++++++ .../kv_transfer/kv_connector/v1/multi_connector.py | 5 +++++ vllm/v1/core/sched/scheduler.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 7a2ccb5865..b721043978 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -12,6 +12,8 @@ The class provides the following primitives: times for a given request and should be side-effect free. update_state_after_alloc() - update KVConnector state after temporary buffer alloc by the CacheManager. + update_connector_output() - update KVConnector state after + output is received from worker-side connectors. request_finished() - called when a request is finished, with the computed kv cache blocks for the request. Returns whether KV cache should be freed now or will be @@ -38,6 +40,7 @@ import torch from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -283,6 +286,16 @@ class KVConnectorBase_V1(ABC): """ pass + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. 
+ """ + return + def request_finished( self, request: "Request", diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 62a4980bff..7d67c76e2f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -14,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( from vllm.logger import init_logger from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata @@ -177,6 +178,10 @@ class MultiConnector(KVConnectorBase_V1): self._extra_async_saves = {} return metadata + def update_connector_output(self, connector_output: KVConnectorOutput): + for c in self._connectors: + c.update_connector_output(connector_output) + def request_finished( self, request: "Request", diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 430085d9c9..85fc1a4a01 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1150,6 +1150,10 @@ class Scheduler(SchedulerInterface): # if finished_recving: add to state so we can scheduler the request during the next step. """ + + assert self.connector is not None + self.connector.update_connector_output(kv_connector_output) + # KV Connector:: update recv and send status from last step. for req_id in (kv_connector_output.finished_recving or ()): logger.debug("Finished recving KV transfer for request %s", req_id) From 65552b476b1c475ef433995d2699bb27428693b3 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 9 Aug 2025 14:10:51 +0800 Subject: [PATCH 114/932] [Misc] Use config definitions from Transformers library (#21913) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aimv2.py | 22 +++++++++---------- vllm/model_executor/models/commandr.py | 8 +++---- vllm/model_executor/models/dbrx.py | 14 ++++++------ vllm/model_executor/models/deepseek_v2.py | 15 ++++++++----- vllm/model_executor/models/dots1.py | 8 +++---- vllm/model_executor/models/exaone4.py | 6 ++--- vllm/model_executor/models/glm4_moe.py | 10 ++++----- vllm/model_executor/models/minimax_text_01.py | 6 ++--- vllm/model_executor/models/olmoe.py | 4 ++-- vllm/model_executor/models/qwen2_moe.py | 6 ++--- vllm/model_executor/models/qwen3_moe.py | 6 ++--- 11 files changed, 54 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index d2307bb464..b13d863ebb 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -8,7 +8,6 @@ from typing import Optional import torch import torch.nn as nn -from transformers import PretrainedConfig from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size @@ -21,12 +20,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.transformers_utils.configs.ovis import AIMv2Config class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): 
super().__init__() hidden_features = config.intermediate_size in_features = config.hidden_size @@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2PatchEmbed(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: AIMv2Config): super().__init__() self.proj = nn.Conv2d( config.num_channels, @@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module): class AIMv2ViTPreprocessor(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: AIMv2Config): super().__init__() num_patches = (config.image_size // config.patch_size)**2 @@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module): class AIMv2Attention(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module): class AIMv2Block(nn.Module): - def __init__(self, config: PretrainedConfig, - quant_config: QuantizationConfig, prefix: str): + def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + prefix: str): super().__init__() self.attn = AIMv2Attention(config, quant_config=quant_config, @@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: AIMv2Config, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, @@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module): class AIMv2Model(torch.nn.Module): def __init__(self, - config: PretrainedConfig, + config: AIMv2Config, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index c4f6144ed9..69281abf73 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -27,7 +27,7 @@ from typing import Optional, Union import torch from torch import nn -from transformers import CohereConfig +from transformers import Cohere2Config, CohereConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -89,7 +89,7 @@ class CohereMLP(nn.Module): def __init__( self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -124,7 +124,7 @@ class CohereAttention(nn.Module): def __init__( self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -242,7 +242,7 @@ class CohereAttention(nn.Module): class CohereDecoderLayer(nn.Module): def __init__(self, - config: CohereConfig, + config: Union[CohereConfig, Cohere2Config], cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 360c7e66bf..e74d90e0b1 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -6,7 +6,7 @@ from typing import Optional, Union import torch import torch.nn as nn -from transformers import PretrainedConfig +from transformers import DbrxConfig from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig @@ -39,7 +39,7 @@ class DbrxRouter(nn.Module): def __init__( self, - config: PretrainedConfig, + 
config: DbrxConfig, params_dtype: Optional[torch.dtype] = None, ): super().__init__() @@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -138,7 +138,7 @@ class DbrxMoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -169,7 +169,7 @@ class DbrxAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -284,7 +284,7 @@ class DbrxBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c2880c33cb..f199da135e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -29,7 +29,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import DeepseekV2Config, DeepseekV3Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -100,7 +100,7 @@ class DeepseekV2MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -221,7 +221,7 @@ class DeepseekV2Attention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], hidden_size: int, num_heads: int, qk_nope_head_dim: int, @@ -373,7 +373,7 @@ class DeepseekV2MLAAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], hidden_size: int, num_heads: int, qk_nope_head_dim: int, @@ -538,7 +538,7 @@ class DeepseekV2DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Union[DeepseekV2Config, DeepseekV3Config], prefix: str, model_config: ModelConfig, cache_config: Optional[CacheConfig] = None, @@ -973,7 +973,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): pass -def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, +# Compatibility with +# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py +def get_spec_layer_idx_from_weight_name(config: Union[DeepseekV2Config, + DeepseekV3Config], weight_name: str) -> Optional[int]: if (hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0): diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 9b21a79446..5f410c0ae5 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -29,7 +29,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers 
import Dots1Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -99,7 +99,7 @@ class Dots1MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Dots1Config, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -174,7 +174,7 @@ class Dots1Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - config: PretrainedConfig, + config: Dots1Config, rope_theta: float = 10000, rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, @@ -260,7 +260,7 @@ class Dots1DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Dots1Config, prefix: str, model_config: ModelConfig, cache_config: Optional[CacheConfig] = None, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 3d6ce3e889..ecd942a76a 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -26,7 +26,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import Exaone4Config from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Exaone4Config, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Exaone4Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 0053e4e6ff..624eef6cf1 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -28,7 +28,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers.models.glm4_moe import Glm4MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -100,7 +100,7 @@ class Glm4MoE(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -198,7 +198,7 @@ class Glm4MoeAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -297,7 +297,7 @@ class Glm4MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Glm4MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -681,7 +681,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): return self.model.get_expert_mapping() -def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, +def get_spec_layer_idx_from_weight_name(config: Glm4MoeConfig, weight_name: str) -> Optional[int]: if hasattr(config, "num_nextn_predict_layers") and (config.num_nextn_predict_layers diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 1f9f7f60ca..3d14a6ad5c 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -12,7 +12,7 @@ import torch.distributed import torch.nn.functional as F from einops 
import rearrange from torch import nn -from transformers.configuration_utils import PretrainedConfig +from transformers import MiniMaxConfig from vllm import envs from vllm.attention import Attention, AttentionMetadata @@ -656,7 +656,7 @@ class MiniMaxText01DecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MiniMaxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, expert_num: int = 1, @@ -860,7 +860,7 @@ class MiniMaxText01Model(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MiniMaxConfig, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, scheduler_config=None, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 7552f64c42..a47c3bd416 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -19,7 +19,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import OlmoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -205,7 +205,7 @@ class OlmoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: OlmoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index b061e2f69a..5c4ad34246 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -30,7 +30,7 @@ from typing import Any, Optional, Union import torch import torch.nn.functional as F from torch import nn -from transformers import PretrainedConfig +from transformers import Qwen2MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -98,7 +98,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen2MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -256,7 +256,7 @@ class Qwen2MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen2MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index b2397c115d..3d1e72299b 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -28,7 +28,7 @@ from typing import Any, Optional, Union import torch from torch import nn -from transformers import PretrainedConfig +from transformers import Qwen3MoeConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -101,7 +101,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen3MoeConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", enable_eplb: bool = False, @@ -278,7 +278,7 @@ class Qwen3MoeDecoderLayer(nn.Module): def __init__( self, - config: PretrainedConfig, + config: Qwen3MoeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", From 10a02535d4252353880486f6fdf91e5ce7507977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> 
Date: Sat, 9 Aug 2025 08:12:12 +0200 Subject: [PATCH 115/932] Fix loading of quantized BigCode models (#22463) Signed-off-by: Eldar Kurtic --- vllm/model_executor/models/gpt_bigcode.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 661a67bdc0..036ded530f 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -45,7 +45,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTBigCodeAttention(nn.Module): @@ -83,6 +84,7 @@ class GPTBigCodeAttention(nn.Module): total_num_kv_heads, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_attn", ) self.c_proj = RowParallelLinear( @@ -90,6 +92,7 @@ class GPTBigCodeAttention(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.attn = Attention(self.num_heads, self.head_dim, @@ -123,6 +126,7 @@ class GPTBigMLP(nn.Module): intermediate_size: int, config: GPTBigCodeConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -131,12 +135,14 @@ class GPTBigMLP(nn.Module): intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( intermediate_size, hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.activation_function) @@ -167,7 +173,10 @@ class GPTBigCodeBlock(nn.Module): quant_config, prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPTBigMLP(inner_dim, config, quant_config) + self.mlp = GPTBigMLP(inner_dim, + config, + quant_config, + prefix=f"{prefix}.mlp") def forward( self, @@ -260,7 +269,7 @@ class GPTBigCodeModel(nn.Module): weight_loader = getattr(param, "weight_loader", default_weight_loader) # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method - if "c_attn.input_scale" in name or "c_attn.weight_scale" in name: + if "c_attn.input_scale" in name: weight_loader(param, loaded_weight, 'q') weight_loader(param, loaded_weight, 'k') weight_loader(param, loaded_weight, 'v') @@ -284,7 +293,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.quant_config = quant_config self.transformer = GPTBigCodeModel(vllm_config=vllm_config, - prefix=prefix) + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: From 9a0c5ded5aef022d2cfd1a263cd1fecdeb6697be Mon Sep 17 00:00:00 2001 From: Kyuyeun Kim <62023335+kyuyeunk@users.noreply.github.com> Date: Fri, 8 Aug 2025 23:12:54 -0700 Subject: [PATCH 116/932] [TPU] Add support for online w8a8 quantization (#22425) Signed-off-by: Kyuyeun Kim --- .../hardware_ci/run-tpu-v1-test-part2.sh | 2 + tests/v1/tpu/test_tpu_int8.py | 73 +++++++++++++++++++ .../layers/quantization/tpu_int8.py | 10 ++- 3 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 tests/v1/tpu/test_tpu_int8.py diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 734a817fd1..10d2e23649 100755 --- 
a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -139,6 +139,8 @@ run_and_track_test 5 "test_spmd_model_weight_loading.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" run_and_track_test 6 "test_kv_cache_update_kernel.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" +run_and_track_test 7 "test_tpu_int8.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py" # After all tests have been attempted, exit with the overall status. if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/tests/v1/tpu/test_tpu_int8.py b/tests/v1/tpu/test_tpu_int8.py new file mode 100644 index 0000000000..991070dc92 --- /dev/null +++ b/tests/v1/tpu/test_tpu_int8.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests whether TPU Int8 computation is enabled correctly. + +Run `pytest tests/quantization/test_tpu_int8.py`. +""" +import pytest + +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.tpu_int8 import ( + TPUInt8LinearMethod) +from vllm.platforms import current_platform + +from ...models.registry import HF_EXAMPLE_MODELS + +MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] + + +@pytest.mark.skipif(not current_platform.is_tpu(), + reason="TPU Int8 is only enabled for TPUs.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize( + "hf_overrides", + [ + # w8a8 dynamic activation + { + 'quantization_config': { + 'quant_method': 'tpu_int8', + 'activation_scheme': 'dynamic' + } + } + ]) +def test_model_tpu_int8(vllm_runner, model: str, dtype: str, max_tokens: int, + hf_overrides: dict, monkeypatch) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") + + activation_scheme = hf_overrides.get('quantization_config', + {}).get('activation_scheme') + quantize_activation = activation_scheme == 'dynamic' + + # Allows using apply_model + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + # Prevent error from re-initializing cache + monkeypatch.setenv("VLLM_XLA_CACHE_PATH", "") + + prompts = [ + "A robot may not injure a human being", + "It is only with the heart that one can see rightly;", + "The greatest glory in living lies not in never falling,", + ] + answers = [ + "or, being injured, not kill, except in", + "without the heart, one can only see wrongly.", + "but in rising every time we fall. 
- Nelson" + ] + + with vllm_runner(model, dtype=dtype, hf_overrides=hf_overrides) as vllm: + + def check_model(model): + for name, module in model.named_modules(): + if not isinstance(module, LinearBase): + continue + quant_method = module.quant_method + assert isinstance(quant_method, TPUInt8LinearMethod) + assert quant_method.quantize_activation == quantize_activation + + vllm.apply_model(check_model) + outputs = vllm.generate_greedy(prompts, max_tokens) + for (_, output), answer in zip(outputs, answers): + assert answer in output diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 83c8a98eac..38de4b54fb 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import ModelWeightParameter -ACTIVATION_SCHEMES = ["none"] +ACTIVATION_SCHEMES = ["none", "dynamic"] class Int8TpuConfig(QuantizationConfig): @@ -61,6 +61,9 @@ class TPUInt8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Int8TpuConfig): self.quant_config = quant_config + self.quantize_activation = False + if self.quant_config.activation_scheme == 'dynamic': + self.quantize_activation = True def create_weights(self, layer: Module, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, @@ -107,7 +110,7 @@ class TPUInt8LinearMethod(LinearMethodBase): x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: try: - import torch_xla.experimental.xla_quantized_matmul # noqa: F401 + import torch_xla.experimental.custom_kernel # noqa: F401 except ImportError as err: raise ImportError( "Please install torch_xla by following the instructions at " @@ -115,7 +118,8 @@ class TPUInt8LinearMethod(LinearMethodBase): "to run vLLM on TPU.") from err weight = layer.weight scale = layer.scale - out = torch.ops.xla.quantized_matmul(x, weight, scale) + out = torch.ops.xla.quantized_matmul_int8( + x, weight, scale, quantize_activation=self.quantize_activation) if bias is not None: out = out + bias return out From b7c0942b65380ab8c53ecf2657121e1c21150672 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Sat, 9 Aug 2025 01:15:06 -0500 Subject: [PATCH 117/932] [ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097) Signed-off-by: charlifu --- csrc/rocm/attention.cu | 179 +++++++++++++++++------------------ csrc/rocm/ops.h | 4 +- csrc/rocm/torch_bindings.cpp | 4 +- 3 files changed, 91 insertions(+), 96 deletions(-) diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 65cb1c1d14..e3a0e15f53 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -270,7 +270,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -304,12 +304,12 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const auto max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; + const int seq_len = seq_lens[seq_idx]; const int 
partition_start_token_idx = partition_idx * T_PAR_SIZE; // partition_size; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -361,8 +361,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // output layout from QKmfma : QH16xT4x4 16 qheads across 16 lanes, 16 tokens // across 4 rows x 4 tokens per lane - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -373,9 +373,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -476,9 +476,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( // tokens const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -554,7 +554,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( if constexpr (ALIBI_ENABLED) { for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; - const int alibi_offset = local_token_idx - context_len + 1; + const int alibi_offset = local_token_idx - seq_len + 1; for (int i = 0; i < 4; i++) { d_out[token_depth][i] += alibi_slope * (alibi_offset + i); } @@ -568,9 +568,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 4; i++) { - const float tmp = (local_token_idx + i < context_len) - ? d_out[token_depth][i] - : -FLT_MAX; + const float tmp = + (local_token_idx + i < seq_len) ? d_out[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -582,7 +581,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 4; i++) { - const float tmp = (local_token_idx + i < context_len) + const float tmp = (local_token_idx + i < seq_len) ? 
__expf(d_out[token_depth][i] - qk_max) : 0.0f; d_out[token_depth][i] = tmp; @@ -780,7 +779,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -809,10 +808,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const auto partition_size = blockDim.x; const auto max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; + const int seq_len = seq_lens[seq_idx]; const int partition_start_token_idx = partition_idx * partition_size; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } // every 4 lanes fetch 4 different qheads @@ -855,7 +854,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int warp_start_token_idx = partition_start_token_idx + warpid * WARP_SIZE; - if (warp_start_token_idx >= context_len) { // warp out of context + if (warp_start_token_idx >= seq_len) { // warp out of context #pragma unroll for (int h = 0; h < GQA_RATIO4; h++) { shared_qk_max[warpid][h] = -FLT_MAX; @@ -863,8 +862,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } } else { // warp within context - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; // token id within partition @@ -873,9 +872,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int global_token_idx = partition_start_token_idx + local_token_idx; // fetch block number for k - const int block_idx = (global_token_idx < context_len) + const int block_idx = (global_token_idx < seq_len) ? global_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; // fetch k physical block number // int32 physical_block_number leads to overflow when multiplied with @@ -888,7 +887,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int b = 0; b < VBLOCKS; b++) { const int vblock_idx = warp_start_block_idx + b; const int vblock_idx_ctx = - (vblock_idx <= last_ctx_block) ? vblock_idx : last_ctx_block; + (vblock_idx <= last_seq_block) ? vblock_idx : last_seq_block; vphysical_blocks[b] = block_table[vblock_idx_ctx]; } @@ -1057,7 +1056,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int lane4_token_idx = 4 * (global_token_idx >> 2); if constexpr (ALIBI_ENABLED) { - const int alibi_offset = lane4_token_idx - context_len + 1; + const int alibi_offset = lane4_token_idx - seq_len + 1; for (int h = 0; h < QHLOOP; h++) { for (int i = 0; i < 4; i++) { d_out[h][i] += alibi_slope[h] * (alibi_offset + i); @@ -1070,7 +1069,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int h = 0; h < QHLOOP; h++) { qk_max[h] = -FLT_MAX; for (int i = 0; i < 4; i++) { - qk_max[h] = (lane4_token_idx + i < context_len) + qk_max[h] = (lane4_token_idx + i < seq_len) ? 
fmaxf(qk_max[h], d_out[h][i]) : qk_max[h]; } @@ -1101,7 +1100,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( for (int h = 0; h < QHLOOP; h++) { exp_sum[h] = 0.0f; for (int i = 0; i < 4; i++) { - d_out[h][i] = (lane4_token_idx + i < context_len) + d_out[h][i] = (lane4_token_idx + i < seq_len) ? __expf(d_out[h][i] - qk_max[h]) : 0.0f; exp_sum[h] += d_out[h][i]; @@ -1181,7 +1180,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( } } - if (warp_start_token_idx >= context_len) { // warp out of context + if (warp_start_token_idx >= seq_len) { // warp out of context for (int qh = 0; qh < QHLOOP; qh++) { for (int vh = 0; vh < VHELOOP; vh++) { vout_shared[qh][vh][laneid][warpid] = {0}; @@ -1279,7 +1278,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -1293,8 +1292,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const auto warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -1581,7 +1580,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -1615,11 +1614,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; // length of a seq + const int seq_len = seq_lens[seq_idx]; // length of a seq const int partition_start_token_idx = partition_idx * T_PAR_SIZE; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -1715,8 +1714,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( } } - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -1727,9 +1726,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? 
kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -1781,9 +1780,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( vblock_depth * BLOCK_SIZE; const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -1836,9 +1835,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + 2 * i < context_len) - ? dout[token_depth][i] - : -FLT_MAX; + const float tmp = + (local_token_idx + 2 * i < seq_len) ? dout[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -1848,7 +1846,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + 2 * i < context_len) + const float tmp = (local_token_idx + 2 * i < seq_len) ? __expf(dout[token_depth][i] - qk_max) : 0.0f; dout[token_depth][i] = tmp; @@ -2019,7 +2017,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2046,7 +2044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -2060,8 +2058,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const int warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -2349,7 +2347,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2382,11 +2380,11 @@ __launch_bounds__(NUM_THREADS, 3) void 
paged_attention_ll4mi_QKV_mfma16_kernel( const int max_num_partitions = gridDim.y; - const int context_len = context_lens[seq_idx]; // length of a seq + const int seq_len = seq_lens[seq_idx]; // length of a seq const int partition_start_token_idx = partition_idx * T_PAR_SIZE; // exit if partition is out of context for seq - if (partition_start_token_idx >= context_len) { + if (partition_start_token_idx >= seq_len) { return; } @@ -2482,8 +2480,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( } } - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int last_ctx_block = num_context_blocks - 1; + const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE); + const int last_seq_block = num_seq_blocks - 1; const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; @@ -2494,9 +2492,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; - const int kblock_idx = (kglobal_token_idx < context_len) + const int kblock_idx = (kglobal_token_idx < seq_len) ? kglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; } @@ -2548,9 +2546,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE; const int vglobal_token_idx = partition_start_token_idx + vlocal_token_idx; - const int vblock_idx = (vglobal_token_idx < context_len) + const int vblock_idx = (vglobal_token_idx < seq_len) ? vglobal_token_idx / BLOCK_SIZE - : last_ctx_block; + : last_seq_block; vphysical_block_number[vtoken_depth][vblock_depth] = block_table_seq[vblock_idx]; } @@ -2604,7 +2602,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { const float tmp = - (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX; + (local_token_idx + i < seq_len) ? dout[token_depth][i] : -FLT_MAX; qk_max = fmaxf(qk_max, tmp); } } @@ -2614,7 +2612,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( for (int token_depth = 0; token_depth < TLOOP; token_depth++) { const int local_token_idx = qkout_token_idx + token_depth * 16; for (int i = 0; i < 8; i++) { - const float tmp = (local_token_idx + i < context_len) + const float tmp = (local_token_idx + i < seq_len) ? 
__expf(dout[token_depth][i] - qk_max) : 0.0f; dout[token_depth][i] = tmp; @@ -2751,7 +2749,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( // head_size, block_size] const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -2778,7 +2776,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( // max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { const auto num_heads = gridDim.x; @@ -2792,8 +2790,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( return; } - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + const int seq_len = seq_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE); const int warpid = threadIdx.x / WARP_SIZE; __shared__ float shared_global_exp_sum; @@ -2980,7 +2978,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -3007,7 +3005,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( const int num_kv_heads, const float scale, const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] @@ -3031,7 +3029,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ seq_lens, // [num_seqs] const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { UNREACHABLE_CODE @@ -3046,7 +3044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( GQA_RATIO> \ <<>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ max_ctx_blocks, k_scale_ptr, v_scale_ptr); @@ -3057,18 +3055,17 @@ 
__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( GQA_RATIO> \ <<>>( \ query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \ max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ max_ctx_blocks, k_scale_ptr, v_scale_ptr); -#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ - paged_attention_ll4mi_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ - context_lens_ptr, query_start_loc_ptr, max_num_partitions, \ - fp8_out_scale_ptr); +#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ + paged_attention_ll4mi_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + query_start_loc_ptr, max_num_partitions, fp8_out_scale_ptr); template & query_start_loc, int max_context_len, + torch::Tensor& block_tables, torch::Tensor& seq_lens, + const std::optional& query_start_loc, int max_seq_len, const std::optional& alibi_slopes, torch::Tensor& k_scale, torch::Tensor& v_scale, const std::optional& fp8_out_scale) { int num_seqs = block_tables.size(0); @@ -3109,7 +3106,7 @@ void paged_attention_custom_launcher( KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); - int* context_lens_ptr = context_lens.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); // NOTE: fp8_out_scale is optional. @@ -3119,13 +3116,12 @@ void paged_attention_custom_launcher( : nullptr; OUTT* out_ptr = reinterpret_cast(out.data_ptr()); - const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE); // partition size is fixed at 256 since both mfma4 and mfma16 kernels support // it mfma4 kernel also supports partition size 512 constexpr int PARTITION_SIZE = 256; - const int max_num_partitions = - DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); const int gqa_ratio = num_heads / num_kv_heads; assert(num_heads % num_kv_heads == 0); assert(head_size == HEAD_SIZE); @@ -3234,8 +3230,8 @@ void paged_attention_custom_launcher_navi( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, - const std::optional& query_start_loc, int max_context_len, + torch::Tensor& block_tables, torch::Tensor& seq_lens, + const std::optional& query_start_loc, int max_seq_len, const std::optional& alibi_slopes, torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_seqs = block_tables.size(0); @@ -3263,7 +3259,7 @@ void paged_attention_custom_launcher_navi( KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); - int* context_lens_ptr = context_lens.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = 
reinterpret_cast(v_scale.data_ptr()); @@ -3271,11 +3267,10 @@ void paged_attention_custom_launcher_navi( const auto fp8_out_scale_ptr = nullptr; OUTT* out_ptr = reinterpret_cast(out.data_ptr()); - const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE); constexpr int PARTITION_SIZE = 256; - const int max_num_partitions = - DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); const int gqa_ratio = num_heads / num_kv_heads; assert(num_heads % num_kv_heads == 0); assert(head_size == HEAD_SIZE); @@ -3407,14 +3402,14 @@ void paged_attention_custom_launcher_navi( paged_attention_custom_launcher( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ + num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ + max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ } else { \ paged_attention_custom_launcher_navi< \ T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale); \ + num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \ + max_seq_len, alibi_slopes, k_scale, v_scale); \ } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ @@ -3502,9 +3497,9 @@ void paged_attention( int64_t num_kv_heads, double scale, torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& context_lens, // [num_seqs] + torch::Tensor& seq_lens, // [num_seqs] const std::optional& query_start_loc, // [num_seqs] - int64_t block_size, int64_t max_context_len, + int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index e538197dbc..34dcc9401a 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -15,8 +15,8 @@ void paged_attention( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, + torch::Tensor& block_tables, torch::Tensor& seq_lens, const std::optional& query_start_loc, int64_t block_size, - int64_t max_context_len, const std::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, torch::Tensor& k_scale, torch::Tensor& v_scale, const std::optional& fp8_out_scale); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 34575477bc..66bdc448da 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -41,10 +41,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " Tensor query, Tensor key_cache," " Tensor value_cache, int num_kv_heads," " float scale, Tensor block_tables," - " Tensor context_lens," + " Tensor seq_lens," " Tensor? query_start_loc," " int block_size," - " int max_context_len," + " int max_seq_len," " Tensor? 
alibi_slopes," " str kv_cache_dtype," " Tensor k_scale, Tensor v_scale," From 7920e9b1c5e168fe6218d2d147bdb9acf6bc993d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 9 Aug 2025 15:03:26 +0800 Subject: [PATCH 118/932] [Bugfix] Fix failing GPT-OSS initialization test (#22557) Signed-off-by: Isotr0py --- tests/models/registry.py | 2 +- tests/models/test_initialization.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 2bb06b7d19..64eeed6555 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -200,7 +200,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", {"1b": "EleutherAI/pythia-1.4b"}), - "GptOssForCausalLM": _HfExamplesInfo("openai/gpt-oss-20b"), + "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index f0aa91566b..f06b34285e 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -68,6 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, if model_arch == "Phi4FlashForCausalLM": # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN") + if model_arch == "GptOssForCausalLM": + # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU + # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when + # L4 supports FA3. 
+ m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1") LLM( model_info.default, tokenizer=model_info.tokenizer, From 0edc0cd52b68d293250157226abdf631e52a53a3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 9 Aug 2025 15:03:29 +0800 Subject: [PATCH 119/932] [Bugfix] Fix CI moe kernel failure (#22556) Signed-off-by: Jee Jee Li --- .../moe/test_gpt_oss_triton_kernels.py | 204 ++++++++++++------ 1 file changed, 141 insertions(+), 63 deletions(-) diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 3f9b32ce5a..54f2351bf6 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -5,6 +5,15 @@ from dataclasses import dataclass, fields import pytest import torch import torch.nn.functional as F + +from vllm.utils import has_triton_kernels + +if not has_triton_kernels(): + pytest.skip( + "triton_kernels not found, skipping all related tests", + allow_module_level=True, + ) + import triton_kernels.swiglu from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig from triton_kernels.numerics import InFlexData @@ -65,7 +74,7 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): dtype_dict = { "bf16": torch.bfloat16, "fp8_e4m3": torch.float8_e4m3fn, - "fp8_e5m2": torch.float8_e5m2 + "fp8_e5m2": torch.float8_e5m2, } x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16) @@ -97,12 +106,18 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): x_pad = w1_bottom_pad - w1_tri = F.pad(w1_tri, (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), - mode="constant", - value=0) - w2_tri = F.pad(w2_tri, (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), - mode="constant", - value=0) + w1_tri = F.pad( + w1_tri, + (0, w1_right_pad, 0, w1_bottom_pad, 0, 0), + mode="constant", + value=0, + ) + w2_tri = F.pad( + w2_tri, + (0, w2_right_pad, 0, w2_bottom_pad, 0, 0), + mode="constant", + value=0, + ) w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0), mode="constant", @@ -127,13 +142,19 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts) - w1_scale_tri = convert_layout(wrap_torch_tensor(w1_scale_tri), - w_scale_layout, **w_scale_layout_opts) + w1_scale_tri = convert_layout( + wrap_torch_tensor(w1_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts) - w2_scale_tri = convert_layout(wrap_torch_tensor(w2_scale_tri), - w_scale_layout, **w_scale_layout_opts) + w2_scale_tri = convert_layout( + wrap_torch_tensor(w2_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) pc1 = PrecisionConfig(weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData())) @@ -149,8 +170,22 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int): w1 = w1.transpose(-1, -2).contiguous() w2 = w2.transpose(-1, -2).contiguous() - return (x, w1, w1_bias, w2, w2_bias, exp_data, x_tri, w1_tri, w2_tri, - exp_data_tri, w1_bias_tri, w2_bias_tri, pc1, pc2) + return ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) @dataclass @@ -184,13 +219,14 @@ def swiglu(x, alpha: float = 1.702, limit: float = 1.0): def oai_moe_forward( - hidden_states: torch.Tensor, # (M, K) - w1: torch.Tensor, # (E, 2N) - w1_bias: torch.Tensor, # (E, 2N, K) - w2: torch.Tensor, # (E, K, N) 
- w2_bias: torch.Tensor, # (E, N) - gating_output: torch.Tensor, # (M, E) - topk: int): + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, 2N) + w1_bias: torch.Tensor, # (E, 2N, K) + w2: torch.Tensor, # (E, K, N) + w2_bias: torch.Tensor, # (E, N) + gating_output: torch.Tensor, # (M, E) + topk: int, +): # model.py 309:330, assuming gating and norm t = hidden_states experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True) @@ -240,10 +276,22 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): N = ModelConfig.intermediate_size // tp topk = ModelConfig.experts_per_token - x, w1, w1_bias, w2, w2_bias, exp_data, \ - x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri,\ - w2_bias_tri, pc1, pc2 = init_compute_data( - M, K, N, E, a_dtype, w_dtype, num_warps=8) + ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8) out_triton_monolithic = triton_kernel_moe_forward( hidden_states=x_tri, @@ -255,33 +303,46 @@ def test_equiv(num_token, a_dtype, w_dtype, tp): w1_bias=w1_bias_tri, w2_bias=w2_bias_tri, w1_precision=pc1, - w2_precision=pc2) + w2_precision=pc2, + ) out_triton_monolithic = out_triton_monolithic[..., :K] - out_ref = oai_moe_forward(hidden_states=x, - w1=w1, - w1_bias=w1_bias, - w2=w2, - w2_bias=w2_bias, - gating_output=exp_data, - topk=topk) + out_ref = oai_moe_forward( + hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk, + ) assert_close(ref=out_ref, tri=out_triton_monolithic, maxtol=0.025, rmstol=0.005) -def batched_moe(a: torch.Tensor, w1, w2, gating_output: torch.Tensor, - topk: int, renormalize: bool, w1_bias: torch.Tensor, - w2_bias: torch.Tensor, w1_precision: PrecisionConfig, - w2_precision: PrecisionConfig) -> torch.Tensor: +def batched_moe( + a: torch.Tensor, + w1, + w2, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + w1_bias: torch.Tensor, + w2_bias: torch.Tensor, + w1_precision: PrecisionConfig, + w2_precision: PrecisionConfig, +) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) fused_experts = FusedMoEModularKernel( - BatchedPrepareAndFinalize(max_num_tokens, - num_dispatchers=1, - num_local_experts=w1.shape[0], - rank=0), + BatchedPrepareAndFinalize( + max_num_tokens, + num_dispatchers=1, + num_local_experts=w1.shape[0], + rank=0, + ), BatchedOAITritonExperts( None, max_num_tokens=max_num_tokens, @@ -327,30 +388,46 @@ def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep): N = ModelConfig.intermediate_size topk = ModelConfig.experts_per_token - x, w1, w1_bias, w2, w2_bias, exp_data, \ - x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri, \ - w2_bias_tri, pc1, pc2 = init_compute_data( - M, K, N, E, a_dtype, w_dtype, num_warps=4) + ( + x, + w1, + w1_bias, + w2, + w2_bias, + exp_data, + x_tri, + w1_tri, + w2_tri, + exp_data_tri, + w1_bias_tri, + w2_bias_tri, + pc1, + pc2, + ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=4) - out_tri = batched_moe(a=x_tri, - w1=w1_tri, - w2=w2_tri, - gating_output=exp_data_tri, - topk=topk, - renormalize=True, - w1_bias=w1_bias_tri, - w2_bias=w2_bias_tri, - w1_precision=pc1, - w2_precision=pc2) + out_tri = batched_moe( + a=x_tri, + w1=w1_tri, + w2=w2_tri, + gating_output=exp_data_tri, + topk=topk, + renormalize=True, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + w1_precision=pc1, + w2_precision=pc2, + ) out_tri = out_tri[..., :K] - out_ref = 
oai_moe_forward(hidden_states=x, - w1=w1, - w1_bias=w1_bias, - w2=w2, - w2_bias=w2_bias, - gating_output=exp_data, - topk=topk) + out_ref = oai_moe_forward( + hidden_states=x, + w1=w1, + w1_bias=w1_bias, + w2=w2, + w2_bias=w2_bias, + gating_output=exp_data, + topk=topk, + ) assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005) @@ -370,6 +447,7 @@ def test_unit_shuffle(): out = triton_kernels.swiglu.swiglu_torch( out, alpha=1.702, - precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0)) + precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0), + ) - assert_close(ref=out_ref, tri=out) \ No newline at end of file + assert_close(ref=out_ref, tri=out) From 2be07a0db115e65009111145e17b034c54ae4a01 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 09:18:18 +0200 Subject: [PATCH 120/932] Update docs for Minimax-Text support (#22562) Signed-off-by: Thomas Parnell --- docs/models/supported_models.md | 4 ++-- docs/usage/v1_guide.md | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index afabfccb55..87dd08e059 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -404,8 +404,8 @@ th { | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ | | `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | | +| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | !!! note diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index d30144e8a8..a9492c8502 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -111,6 +111,10 @@ Models that combine Mamba-2 and Mamba-1 layers with standard attention layers ar `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. +Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). +Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer +attention backend in V1. 
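A minimal offline sketch of the constraints just described, assuming the `enable_prefix_caching` and `enforce_eager` engine arguments and the `FLASHINFER` value of `VLLM_ATTENTION_BACKEND`; the checkpoint name is only an example taken from the supported-models table and would normally also need tensor parallelism:

```python
# Sketch: run a MiniMax-Text hybrid model with prefix caching disabled,
# eager mode enforced, and the FlashInfer attention backend selected.
# The flag names and backend value are assumptions to verify against the docs.
import os

# Select the FlashInfer backend before vLLM is imported.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM, SamplingParams

llm = LLM(
    model="MiniMaxAI/MiniMax-Text-01",  # example checkpoint; large, typically needs tensor parallelism
    enable_prefix_caching=False,        # prefix caching must be disabled for these models
    enforce_eager=True,                 # eager mode is currently required
)

outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```
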
+ #### Encoder-Decoder Models Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`) From a6022e6fbcbdba65e3c0e6dce5c9e3cbc8120e90 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Sat, 9 Aug 2025 15:50:21 +0800 Subject: [PATCH 121/932] GLM-4.5V with new class name at transformers (#22520) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- docs/models/supported_models.md | 4 ++-- tests/models/registry.py | 2 +- vllm/model_executor/models/glm4_moe.py | 8 +++++++- vllm/model_executor/models/registry.py | 2 +- vllm/transformers_utils/config.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 87dd08e059..19186a0635 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -352,6 +352,7 @@ th { | `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | @@ -609,8 +610,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 64eeed6555..09d62413fe 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -396,7 +396,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 - "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", + "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 624eef6cf1..131c042c3c 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -372,7 +372,13 @@ class Glm4MoeDecoderLayer(nn.Module): return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Glm4MoeModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c746e8ec3f..4aa958ecdc 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -207,7 +207,7 @@ _MULTIMODAL_MODELS = { "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 - "Glm4v_moeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index bce24ef74c..de779f94a4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -254,7 +254,8 @@ def _uses_mrope(config: PretrainedConfig) -> bool: def uses_mrope(config: PretrainedConfig) -> bool: """Detect if the model with this config uses M-ROPE.""" - return _uses_mrope(config) or thinker_uses_mrope(config) + return _uses_mrope(config) or _uses_mrope( + config.get_text_config()) or thinker_uses_mrope(config) def thinker_uses_mrope(config: PretrainedConfig) -> bool: From 1bf5e1f25b92423f5739ea7cbb9266f61af12b0b Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 9 Aug 2025 11:04:42 +0200 Subject: [PATCH 122/932] [CI] [Hybrid] Speed up hybrid models test by removing large models (#22563) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 8c3e1f5c2b..4934da9517 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -20,7 +20,7 @@ pytestmark = pytest.mark.hybrid_model SSM_MODELS = [ "state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev", - 
"mistralai/Mamba-Codestral-7B-v0.1", + "yujiepan/mamba2-codestral-v0.1-tiny-random", ] HYBRID_MODELS = [ @@ -29,8 +29,6 @@ HYBRID_MODELS = [ # "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", - "ibm-ai-platform/Bamba-9B-v1", - "nvidia/Nemotron-H-8B-Base-8K", "ibm-granite/granite-4.0-tiny-preview", "tiiuae/Falcon-H1-0.5B-Base", ] @@ -40,23 +38,18 @@ HF_UNSUPPORTED_MODELS = [ # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test # doesn't compare vLLM output with HF output. # See https://github.com/huggingface/transformers/pull/35943 - "mistralai/Mamba-Codestral-7B-v0.1", - # Note: I'm not seeing the same output from vLLM V0 vs. HF transformers - # for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1 - "nvidia/Nemotron-H-8B-Base-8K", - # NOTE: Currently the test fails due to HF transformers issue fixed in: - # https://github.com/huggingface/transformers/pull/39033 - # We will enable vLLM test for Granite after next HF transformers release. - "ibm-granite/granite-4.0-tiny-preview", + "yujiepan/mamba2-codestral-v0.1-tiny-random", + # transformers 4.55 is still producing garbage for this model + # TODO(tdoublep): follow-up on transformers side + "ibm-granite/granite-4.0-tiny-preview" ] V1_SUPPORTED_MODELS = [ "state-spaces/mamba-130m-hf", "ai21labs/Jamba-tiny-dev", - "mistralai/Mamba-Codestral-7B-v0.1", - "ibm-ai-platform/Bamba-9B-v1", + "yujiepan/mamba2-codestral-v0.1-tiny-random", "Zyphra/Zamba2-1.2B-instruct", - "nvidia/Nemotron-H-8B-Base-8K", + "hmellor/tiny-random-BambaForCausalLM", "ibm-granite/granite-4.0-tiny-preview", "tiiuae/Falcon-H1-0.5B-Base", ] From 56186474f6afef825943fb5c5b1ad288909b6783 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:31:32 +0100 Subject: [PATCH 123/932] [Docs] Reduce noise in docs and `--help` from the JSON tip (#22567) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/cli/README.md | 10 ++++++++++ docs/configuration/engine_args.md | 10 ++++++++++ vllm/engine/arg_utils.py | 23 ++--------------------- vllm/utils/__init__.py | 21 ++++++++++++++++++--- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index a7de6d7192..b512a4f4ba 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -16,6 +16,16 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` + ## serve Start the vLLM OpenAI Compatible API server. diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md index c3c1d5a1c3..e7ca08b557 100644 --- a/docs/configuration/engine_args.md +++ b/docs/configuration/engine_args.md @@ -11,6 +11,16 @@ Engine arguments control the behavior of the vLLM engine. The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. 
Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings. +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` + ## `EngineArgs` --8<-- "docs/argparse/engine_args.md" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c9dc99cad2..4d4ce4c78e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -178,17 +178,8 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name] = {"default": default, "help": help} # Set other kwargs based on the type hints - json_tip = """Should either be a valid JSON string or JSON keys -passed individually. For example, the following sets of arguments are -equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`""" + json_tip = ("Should either be a valid JSON string or JSON keys passed " + "individually.") if dataclass_cls is not None: def parse_dataclass(val: str, cls=dataclass_cls) -> Any: @@ -1831,13 +1822,3 @@ def human_readable_int(value): # Regular plain number. return int(value) - - -# These functions are used by sphinx to build the documentation -def _engine_args_parser(): - return EngineArgs.add_cli_args(FlexibleArgumentParser()) - - -def _async_engine_args_parser(): - return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(), - async_args_only=True) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7a0abf5b59..a4997226ea 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1669,11 +1669,19 @@ class FlexibleArgumentParser(ArgumentParser): """ArgumentParser that allows both underscore and dash in names.""" _deprecated: set[Action] = set() + _json_tip: str = ( + "When passing JSON CLI arguments, the following sets of arguments " + "are equivalent:\n" + ' --json-arg \'{"key1": "value1", "key2": {"key3": "value2"}}\'\n' + " --json-arg.key1 value1 --json-arg.key2.key3 value2\n\n" + "Additionally, list elements can be passed individually using +:\n" + ' --json-arg \'{"key4": ["value3", "value4", "value5"]}\'\n' + " --json-arg.key4+ value3 --json-arg.key4+=\'value4,value5\'\n\n") def __init__(self, *args, **kwargs): - # Set the default 'formatter_class' to SortedHelpFormatter - if 'formatter_class' not in kwargs: - kwargs['formatter_class'] = SortedHelpFormatter + # Set the default "formatter_class" to SortedHelpFormatter + if "formatter_class" not in kwargs: + kwargs["formatter_class"] = SortedHelpFormatter super().__init__(*args, **kwargs) if sys.version_info < (3, 13): @@ -1715,6 +1723,13 @@ class FlexibleArgumentParser(ArgumentParser): self._action_groups.append(group) return group + def format_help(self) -> str: + # Add tip about JSON arguments to the epilog + epilog = self.epilog or "" + if not epilog.startswith(FlexibleArgumentParser._json_tip): + self.epilog = FlexibleArgumentParser._json_tip + epilog + return super().format_help() + def 
parse_args( # type: ignore[override] self, args: list[str] | None = None, From 2d18256e47805bf32d1ae04ba1a8c9fd98261fcf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 9 Aug 2025 16:33:46 +0100 Subject: [PATCH 124/932] Move `ParallelConfig` from `config/__init__.py` to `config/parallel.py` (#22565) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 2 +- vllm/config/__init__.py | 357 +---------------------------------- vllm/config/compilation.py | 2 +- vllm/config/parallel.py | 375 +++++++++++++++++++++++++++++++++++++ 4 files changed, 379 insertions(+), 357 deletions(-) create mode 100644 vllm/config/parallel.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5bc9442967..0a7f8e8be4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor +/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 69c05b75d3..7efab23f14 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -25,13 +25,13 @@ from pydantic import (ConfigDict, SkipValidation, field_validator, model_validator) from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) +from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -50,20 +50,16 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, LayerBlockType, LazyLoader, common_broadcastable_dtype, - cuda_device_count_stateless, get_cpu_memory, - get_open_port, random_uuid) + get_cpu_memory, random_uuid) # yapf: enable if TYPE_CHECKING: from _typeshed import DataclassInstance - from ray.runtime_env import RuntimeEnv - from ray.util.placement_group import PlacementGroup from transformers.configuration_utils import PretrainedConfig import vllm.model_executor.layers.quantization as me_quant import vllm.model_executor.models as me_models - from vllm.executor.executor_base import ExecutorBase from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -73,10 +69,7 @@ if TYPE_CHECKING: HfOverrides = Union[dict, Callable[[type], type]] else: DataclassInstance = Any - PlacementGroup = Any - RuntimeEnv = Any PretrainedConfig = Any - ExecutorBase = Any QuantizationConfig = Any QuantizationMethods = Any BaseModelLoader = Any @@ -2043,352 +2036,6 @@ class LoadConfig: self.ignore_patterns = ["original/**/*"] -DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] - - -@config -@dataclass -class 
ParallelConfig: - """Configuration for the distributed execution.""" - - pipeline_parallel_size: int = 1 - """Number of pipeline parallel groups.""" - tensor_parallel_size: int = 1 - """Number of tensor parallel groups.""" - data_parallel_size: int = 1 - """Number of data parallel groups. MoE layers will be sharded according to - the product of the tensor parallel size and data parallel size.""" - data_parallel_size_local: int = 1 - """Number of local data parallel groups.""" - data_parallel_rank: int = 0 - """Rank of the data parallel group.""" - data_parallel_rank_local: Optional[int] = None - """Local rank of the data parallel group, - set only in SPMD mode.""" - data_parallel_master_ip: str = "127.0.0.1" - """IP of the data parallel master.""" - data_parallel_rpc_port: int = 29550 - """Port for data parallel messaging.""" - data_parallel_master_port: int = 29500 - """Port of the data parallel master.""" - data_parallel_backend: str = "mp" - """Backend to use for data parallel, either "mp" or "ray".""" - data_parallel_external_lb: bool = False - """Whether to use "external" DP LB mode. Applies only to online serving - and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" - wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank - is provided explicitly to vllm serve.""" - data_parallel_hybrid_lb: bool = False - """Whether to use "hybrid" DP LB mode. Applies only to online serving - and when data_parallel_size > 0. Enables running an AsyncLLM - and API server on a "per-node" basis where vLLM load balances - between local data parallel ranks, but an external LB balances - between vLLM nodes/replicas. Set explicitly in conjunction with - --data-parallel-start-rank.""" - enable_expert_parallel: bool = False - """Use expert parallelism instead of tensor parallelism for MoE layers.""" - enable_eplb: bool = False - """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ - - max_parallel_loading_workers: Optional[int] = None - """Maximum number of parallel loading workers when loading model - sequentially in multiple batches. To avoid RAM OOM when using tensor - parallel and large models.""" - - disable_custom_all_reduce: bool = False - """Disable the custom all-reduce kernel and fall back to NCCL.""" - - ray_workers_use_nsight: bool = False - """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" - - ray_runtime_env: Optional["RuntimeEnv"] = None - """Ray runtime environment to pass to distributed workers.""" - - placement_group: Optional["PlacementGroup"] = None - """ray distributed model workers placement group.""" - - distributed_executor_backend: Optional[Union[DistributedExecutorBackend, - type["ExecutorBase"]]] = None - """Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). 
If the product - of pipeline_parallel_size and tensor_parallel_size is less than - or equal to the number of GPUs available, "mp" will be used to - keep processing on a single host. Otherwise, this will default - to "ray" if Ray is installed and fail otherwise. Note that tpu - only support Ray for distributed inference.""" - - worker_cls: str = "auto" - """The full name of the worker class to use. If "auto", the worker class - will be determined based on the platform.""" - sd_worker_cls: str = "auto" - """The full name of the worker class to use for speculative decoding. - If "auto", the worker class will be determined based on the platform.""" - worker_extension_cls: str = "" - """The full name of the worker extension class to use. The worker extension - class is dynamically inherited by the worker class. This is used to inject - new attributes and methods to the worker class for use in collective_rpc - calls.""" - - world_size: int = field(init=False) - """world_size is TPxPP, it affects the number of workers we create.""" - - rank: int = 0 - """Global rank in distributed setup.""" - - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. - Only support LLama4 for now""" - - @property - def world_size_across_dp(self) -> int: - """world_size_across_dp is TPxPPxDP, it is the size of the world - including data parallelism.""" - return self.world_size * self.data_parallel_size - - def get_next_dp_init_port(self) -> int: - """ - We might need to initialize process groups in multiple - processes that is related to data parallelism, - e.g. both in the worker and in the engine, which - can live in different processes. To avoid port conflicts, we - increment the port number each time we need to initialize a - new process group related to data parallelism. - """ - answer = self.data_parallel_master_port - self.data_parallel_master_port += 1 - return answer - - def stateless_init_dp_group(self) -> "ProcessGroup": - # NOTE: In high-concurrency scenarios multiple processes - # can pick the same (currently free) port through a race - # condition when calling `get_open_port()`. When the first - # process binds the port the others will subsequently fail - # with `torch.distributed.DistNetworkError: EADDRINUSE`. - # To make the initialization more robust we retry a few times - # with a fresh port whenever this specific error is observed. - from torch.distributed import DistNetworkError - - from vllm.distributed.utils import ( - stateless_init_torch_distributed_process_group) - - max_retries = 5 - last_exc: Optional[Exception] = None - for _ in range(max_retries): - try: - # use gloo since the engine process might not have cuda device - return stateless_init_torch_distributed_process_group( - self.data_parallel_master_ip, - self.get_next_dp_init_port(), - self.data_parallel_rank, - self.data_parallel_size, - backend="gloo") - except DistNetworkError as e: - # We only want to retry when the root cause is EADDRINUSE. - if "EADDRINUSE" in str(e): - logger.warning( - "Address already in use. Retrying with a new port.") - last_exc = e - continue # try again with a new port - raise e - - # If we get here all retries have failed. 
- assert last_exc is not None - raise last_exc - - @staticmethod - def has_unfinished_dp(dp_group: "ProcessGroup", - has_unfinished: bool) -> bool: - tensor = torch.tensor([has_unfinished], - dtype=torch.int32, - device="cpu") - # dp rank 0: has_unfinished_seqs=True - # dp rank 1: has_unfinished_seqs=False - # aggregated: has_unfinished_seqs=True - # so this is an OR operation, i.e. MAX in integers - torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group) - aggregated_has_unfinished = bool(tensor.item()) - return aggregated_has_unfinished - - @staticmethod - def sync_kv_cache_memory_size(dp_group: "ProcessGroup", - kv_cache_memory: int) -> int: - if kv_cache_memory == -1: - kv_cache_memory = torch.iinfo(torch.int64).max - tensor = torch.tensor([kv_cache_memory], - dtype=torch.int64, - device="cpu") - # we cannot use broadcast for stateless dp group since it depends - # on global rank - torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) - return tensor.item() - - def compute_hash(self): - """ - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.pipeline_parallel_size) - factors.append(self.tensor_parallel_size) - factors.append(self.enable_expert_parallel) - factors.append(self.data_parallel_size) - factors.append(envs.VLLM_ALL2ALL_BACKEND) - return hashlib.sha256(str(factors).encode()).hexdigest() - - def __post_init__(self) -> None: - self.world_size = self.pipeline_parallel_size * \ - self.tensor_parallel_size - - if self.data_parallel_size_local > self.data_parallel_size: - raise ValueError( - f"data_parallel_size_local ({self.data_parallel_size_local}) " - f"must be <= data_parallel_size ({self.data_parallel_size})") - - if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: - # Data parallel was specified in the engine args. - self.data_parallel_master_port = get_open_port() - - if not (0 <= self.data_parallel_rank < self.data_parallel_size): - raise ValueError( - f"data_parallel_rank ({self.data_parallel_rank})" - f" must be in the range [0, {self.data_parallel_size})") - else: - # Otherwise fall back to env vars (e.g. for offline SPMD case). 
- self.data_parallel_size = envs.VLLM_DP_SIZE - self.data_parallel_rank = envs.VLLM_DP_RANK - self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL - self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP - self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT - - if self.data_parallel_external_lb: - raise ValueError("data_parallel_external_lb can only " - "be set when data_parallel_size > 1") - - if self.distributed_executor_backend == "external_launcher": - import os - os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" - logger.info("Disabling V1 multiprocessing for external launcher.") - - if self.enable_eplb: - if not current_platform.is_cuda(): - raise ValueError( - "Expert parallelism load balancing is only supported on " - "CUDA devices now.") - if self.num_redundant_experts < 0: - raise ValueError( - "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") - if not self.enable_expert_parallel: - raise ValueError( - "enable_expert_parallel must be True to use EPLB.") - if self.tensor_parallel_size * self.data_parallel_size <= 1: - raise ValueError( - "EPLB requires tensor_parallel_size or data_parallel_size " - f"to be greater than 1, but got " - f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." - ) - else: - if self.num_redundant_experts != 0: - raise ValueError( - "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") - if self.distributed_executor_backend is None and self.world_size > 1: - # We use multiprocessing by default if world_size fits on the - # current node and we aren't in a ray placement group. - - from vllm.executor import ray_utils - backend: DistributedExecutorBackend = "mp" - ray_found = ray_utils.ray_is_available() - if current_platform.is_neuron(): - # neuron uses single process to control multiple devices - backend = "uni" - elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: - backend = "uni" - elif (current_platform.is_cuda() - and cuda_device_count_stateless() < self.world_size): - if not ray_found: - raise ValueError("Unable to load Ray: " - f"{ray_utils.ray_import_err}. 
Ray is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`.") - backend = "ray" - elif self.data_parallel_backend == "ray": - logger.info("Using ray distributed inference because " - "data_parallel_backend is ray") - backend = "ray" - elif ray_found: - if self.placement_group: - backend = "ray" - else: - from ray import is_initialized as ray_is_initialized - if ray_is_initialized(): - from ray.util import get_current_placement_group - if get_current_placement_group(): - backend = "ray" - self.distributed_executor_backend = backend - logger.debug("Defaulting to use %s for distributed inference", - backend) - - if self.distributed_executor_backend is None and self.world_size == 1: - self.distributed_executor_backend = "uni" - - @property - def use_ray(self) -> bool: - return self.distributed_executor_backend == "ray" or ( - isinstance(self.distributed_executor_backend, type) - and self.distributed_executor_backend.uses_ray) - - @model_validator(mode='after') - def _verify_args(self) -> Self: - # Lazy import to avoid circular import - from vllm.executor.executor_base import ExecutorBase - from vllm.platforms import current_platform - if self.distributed_executor_backend not in ( - "ray", "mp", "uni", - "external_launcher", None) and not (isinstance( - self.distributed_executor_backend, type) and issubclass( - self.distributed_executor_backend, ExecutorBase)): - raise ValueError( - "Unrecognized distributed executor backend " - f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' 'uni', 'external_launcher' or" - " custom ExecutorBase subclass.") - if self.use_ray: - from vllm.executor import ray_utils - ray_utils.assert_ray_available() - - if not current_platform.use_custom_allreduce(): - self.disable_custom_all_reduce = True - logger.debug( - "Disabled the custom all-reduce kernel because it is not " - "supported on current platform.") - if self.ray_workers_use_nsight and not self.use_ray: - raise ValueError("Unable to use nsight profiling unless workers " - "run with Ray.") - - return self - - PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index c1b3a61217..8a78d811b9 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -16,7 +16,7 @@ from vllm.logger import init_logger from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname if TYPE_CHECKING: - from vllm.config.config import VllmConfig + from vllm.config import VllmConfig else: VllmConfig = object diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py new file mode 100644 index 0000000000..bac1e63800 --- /dev/null +++ b/vllm/config/parallel.py @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, Union + +import torch +from pydantic import model_validator +from pydantic.dataclasses import dataclass +from torch.distributed import ProcessGroup, ReduceOp +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import cuda_device_count_stateless, get_open_port + +if TYPE_CHECKING: + from ray.runtime_env import RuntimeEnv + from ray.util.placement_group import PlacementGroup + + from 
vllm.executor.executor_base import ExecutorBase +else: + RuntimeEnv = Any + PlacementGroup = Any + ExecutorBase = Any + +logger = init_logger(__name__) + +DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] + + +@config +@dataclass +class ParallelConfig: + """Configuration for the distributed execution.""" + + pipeline_parallel_size: int = 1 + """Number of pipeline parallel groups.""" + tensor_parallel_size: int = 1 + """Number of tensor parallel groups.""" + data_parallel_size: int = 1 + """Number of data parallel groups. MoE layers will be sharded according to + the product of the tensor parallel size and data parallel size.""" + data_parallel_size_local: int = 1 + """Number of local data parallel groups.""" + data_parallel_rank: int = 0 + """Rank of the data parallel group.""" + data_parallel_rank_local: Optional[int] = None + """Local rank of the data parallel group, + set only in SPMD mode.""" + data_parallel_master_ip: str = "127.0.0.1" + """IP of the data parallel master.""" + data_parallel_rpc_port: int = 29550 + """Port for data parallel messaging.""" + data_parallel_master_port: int = 29500 + """Port of the data parallel master.""" + data_parallel_backend: str = "mp" + """Backend to use for data parallel, either "mp" or "ray".""" + data_parallel_external_lb: bool = False + """Whether to use "external" DP LB mode. Applies only to online serving + and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" + wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank + is provided explicitly to vllm serve.""" + data_parallel_hybrid_lb: bool = False + """Whether to use "hybrid" DP LB mode. Applies only to online serving + and when data_parallel_size > 0. Enables running an AsyncLLM + and API server on a "per-node" basis where vLLM load balances + between local data parallel ranks, but an external LB balances + between vLLM nodes/replicas. Set explicitly in conjunction with + --data-parallel-start-rank.""" + enable_expert_parallel: bool = False + """Use expert parallelism instead of tensor parallelism for MoE layers.""" + enable_eplb: bool = False + """Enable expert parallelism load balancing for MoE layers.""" + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + eplb_window_size: int = 1000 + """Window size for expert load recording.""" + eplb_step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `eplb_window_size` steps will be used for rearranging experts. + """ + eplb_log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + max_parallel_loading_workers: Optional[int] = None + """Maximum number of parallel loading workers when loading model + sequentially in multiple batches. 
To avoid RAM OOM when using tensor + parallel and large models.""" + + disable_custom_all_reduce: bool = False + """Disable the custom all-reduce kernel and fall back to NCCL.""" + + ray_workers_use_nsight: bool = False + """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" + + ray_runtime_env: Optional[RuntimeEnv] = None + """Ray runtime environment to pass to distributed workers.""" + + placement_group: Optional[PlacementGroup] = None + """ray distributed model workers placement group.""" + + distributed_executor_backend: Optional[Union[DistributedExecutorBackend, + type[ExecutorBase]]] = None + """Backend to use for distributed model + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + only support Ray for distributed inference.""" + + worker_cls: str = "auto" + """The full name of the worker class to use. If "auto", the worker class + will be determined based on the platform.""" + sd_worker_cls: str = "auto" + """The full name of the worker class to use for speculative decoding. + If "auto", the worker class will be determined based on the platform.""" + worker_extension_cls: str = "" + """The full name of the worker extension class to use. The worker extension + class is dynamically inherited by the worker class. This is used to inject + new attributes and methods to the worker class for use in collective_rpc + calls.""" + + world_size: int = field(init=False) + """world_size is TPxPP, it affects the number of workers we create.""" + + rank: int = 0 + """Global rank in distributed setup.""" + + enable_multimodal_encoder_data_parallel: bool = False + """ Use data parallelism instead of tensor parallelism for vision encoder. + Only support LLama4 for now""" + + @property + def world_size_across_dp(self) -> int: + """world_size_across_dp is TPxPPxDP, it is the size of the world + including data parallelism.""" + return self.world_size * self.data_parallel_size + + def get_next_dp_init_port(self) -> int: + """ + We might need to initialize process groups in multiple + processes that is related to data parallelism, + e.g. both in the worker and in the engine, which + can live in different processes. To avoid port conflicts, we + increment the port number each time we need to initialize a + new process group related to data parallelism. + """ + answer = self.data_parallel_master_port + self.data_parallel_master_port += 1 + return answer + + def stateless_init_dp_group(self) -> ProcessGroup: + # NOTE: In high-concurrency scenarios multiple processes + # can pick the same (currently free) port through a race + # condition when calling `get_open_port()`. When the first + # process binds the port the others will subsequently fail + # with `torch.distributed.DistNetworkError: EADDRINUSE`. + # To make the initialization more robust we retry a few times + # with a fresh port whenever this specific error is observed. 
+ from torch.distributed import DistNetworkError + + from vllm.distributed.utils import ( + stateless_init_torch_distributed_process_group) + + max_retries = 5 + last_exc: Optional[Exception] = None + for _ in range(max_retries): + try: + # use gloo since the engine process might not have cuda device + return stateless_init_torch_distributed_process_group( + self.data_parallel_master_ip, + self.get_next_dp_init_port(), + self.data_parallel_rank, + self.data_parallel_size, + backend="gloo") + except DistNetworkError as e: + # We only want to retry when the root cause is EADDRINUSE. + if "EADDRINUSE" in str(e): + logger.warning( + "Address already in use. Retrying with a new port.") + last_exc = e + continue # try again with a new port + raise e + + # If we get here all retries have failed. + assert last_exc is not None + raise last_exc + + @staticmethod + def has_unfinished_dp(dp_group: ProcessGroup, + has_unfinished: bool) -> bool: + tensor = torch.tensor([has_unfinished], + dtype=torch.int32, + device="cpu") + # dp rank 0: has_unfinished_seqs=True + # dp rank 1: has_unfinished_seqs=False + # aggregated: has_unfinished_seqs=True + # so this is an OR operation, i.e. MAX in integers + torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group) + aggregated_has_unfinished = bool(tensor.item()) + return aggregated_has_unfinished + + @staticmethod + def sync_kv_cache_memory_size(dp_group: ProcessGroup, + kv_cache_memory: int) -> int: + if kv_cache_memory == -1: + kv_cache_memory = torch.iinfo(torch.int64).max + tensor = torch.tensor([kv_cache_memory], + dtype=torch.int64, + device="cpu") + # we cannot use broadcast for stateless dp group since it depends + # on global rank + torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) + return tensor.item() + + def compute_hash(self): + """ + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) + factors.append(self.enable_expert_parallel) + factors.append(self.data_parallel_size) + factors.append(envs.VLLM_ALL2ALL_BACKEND) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + + if self.data_parallel_size_local > self.data_parallel_size: + raise ValueError( + f"data_parallel_size_local ({self.data_parallel_size_local}) " + f"must be <= data_parallel_size ({self.data_parallel_size})") + + if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: + # Data parallel was specified in the engine args. + self.data_parallel_master_port = get_open_port() + + if not (0 <= self.data_parallel_rank < self.data_parallel_size): + raise ValueError( + f"data_parallel_rank ({self.data_parallel_rank})" + f" must be in the range [0, {self.data_parallel_size})") + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). 
+ self.data_parallel_size = envs.VLLM_DP_SIZE + self.data_parallel_rank = envs.VLLM_DP_RANK + self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + + if self.data_parallel_external_lb: + raise ValueError("data_parallel_external_lb can only " + "be set when data_parallel_size > 1") + + if self.distributed_executor_backend == "external_launcher": + import os + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + logger.info("Disabling V1 multiprocessing for external launcher.") + + if self.enable_eplb: + if not current_platform.is_cuda(): + raise ValueError( + "Expert parallelism load balancing is only supported on " + "CUDA devices now.") + if self.num_redundant_experts < 0: + raise ValueError( + "num_redundant_experts must be non-negative, but got " + f"{self.num_redundant_experts}.") + if not self.enable_expert_parallel: + raise ValueError( + "enable_expert_parallel must be True to use EPLB.") + if self.tensor_parallel_size * self.data_parallel_size <= 1: + raise ValueError( + "EPLB requires tensor_parallel_size or data_parallel_size " + f"to be greater than 1, but got " + f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." + ) + else: + if self.num_redundant_experts != 0: + raise ValueError( + "num_redundant_experts should be used with EPLB." + f"{self.num_redundant_experts}.") + if self.distributed_executor_backend is None and self.world_size > 1: + # We use multiprocessing by default if world_size fits on the + # current node and we aren't in a ray placement group. + + from vllm.executor import ray_utils + backend: DistributedExecutorBackend = "mp" + ray_found = ray_utils.ray_is_available() + if current_platform.is_neuron(): + # neuron uses single process to control multiple devices + backend = "uni" + elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + backend = "uni" + elif (current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size): + if not ray_found: + raise ValueError("Unable to load Ray: " + f"{ray_utils.ray_import_err}. 
Ray is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`.") + backend = "ray" + elif self.data_parallel_backend == "ray": + logger.info("Using ray distributed inference because " + "data_parallel_backend is ray") + backend = "ray" + elif ray_found: + if self.placement_group: + backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): + from ray.util import get_current_placement_group + if get_current_placement_group(): + backend = "ray" + self.distributed_executor_backend = backend + logger.debug("Defaulting to use %s for distributed inference", + backend) + + if self.distributed_executor_backend is None and self.world_size == 1: + self.distributed_executor_backend = "uni" + + @property + def use_ray(self) -> bool: + return self.distributed_executor_backend == "ray" or ( + isinstance(self.distributed_executor_backend, type) + and self.distributed_executor_backend.uses_ray) + + @model_validator(mode='after') + def _verify_args(self) -> Self: + # Lazy import to avoid circular import + from vllm.executor.executor_base import ExecutorBase + from vllm.platforms import current_platform + if self.distributed_executor_backend not in ( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( + self.distributed_executor_backend, type) and issubclass( + self.distributed_executor_backend, ExecutorBase)): + raise ValueError( + "Unrecognized distributed executor backend " + f"{self.distributed_executor_backend}. Supported " + "values are 'ray', 'mp' 'uni', 'external_launcher' or" + " custom ExecutorBase subclass.") + if self.use_ray: + from vllm.executor import ray_utils + ray_utils.assert_ray_available() + + if not current_platform.use_custom_allreduce(): + self.disable_custom_all_reduce = True + logger.debug( + "Disabled the custom all-reduce kernel because it is not " + "supported on current platform.") + if self.ray_workers_use_nsight and not self.use_ray: + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") + + return self From 5a16fa614c78e1f401125cd7384c602f83cb2160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 9 Aug 2025 18:56:25 +0200 Subject: [PATCH 125/932] [Model] Gemma3n MM (#20495) Signed-off-by: ShriKode Signed-off-by: NickLucche Signed-off-by: Roger Wang Co-authored-by: ShriKode Co-authored-by: Roger Wang --- docs/models/supported_models.md | 15 +- examples/offline_inference/audio_language.py | 20 + examples/offline_inference/vision_language.py | 27 + requirements/test.in | 2 +- requirements/test.txt | 2 +- .../multimodal/processing/test_common.py | 5 +- tests/models/registry.py | 4 +- tests/test_test.py | 61 ++ vllm/model_executor/models/gemma3n.py | 79 +- vllm/model_executor/models/gemma3n_mm.py | 700 ++++++++++++++++++ vllm/model_executor/models/registry.py | 4 +- 11 files changed, 864 insertions(+), 55 deletions(-) create mode 100644 tests/test_test.py create mode 100644 vllm/model_executor/models/gemma3n_mm.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19186a0635..5c48998ba4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -349,7 +349,7 @@ th { | `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | +| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -412,9 +412,6 @@ th { !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -!!! note - Only text inputs are currently supported for `Gemma3nForConditionalGeneration`. To use this model, please upgrade Hugging Face Transformers to version 4.53.0. - ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. @@ -608,6 +605,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | +| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -677,6 +675,15 @@ Some models are supported only via the [Transformers backend](#transformers). Th This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. +!!! note + `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its + MobileNet-v5 vision backbone. + + Performance is not yet fully optimized mainly due to: + + - Both audio and vision MM encoders use `transformers.AutoModel` implementation. + - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups. + !!! note Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 01d6a188be..22cb8b057d 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -96,6 +96,25 @@ def run_voxtral(question: str, audio_count: int) -> ModelRequestData: ) +# Gemma3N +def run_gemma3n(question: str, audio_count: int) -> ModelRequestData: + model_name = "google/gemma-3n-E2B-it" + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_batched_tokens=2048, + max_num_seqs=2, + limit_mm_per_prompt={"audio": audio_count}, + enforce_eager=True, + ) + prompt = f"user\n{question}" + "\nmodel\n" + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # Granite Speech def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: # NOTE - the setting in this example are somehat different than what is @@ -331,6 +350,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: model_example_map = { "voxtral": run_voxtral, + "gemma3n": run_gemma3n, "granite_speech": run_granite_speech, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 1314d33e90..5b3f0d2dc2 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -211,7 +211,33 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: ) for question in questions ] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + +# Gemma3N +def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "google/gemma-3n-E2B-it" + + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + enforce_eager=True, + ) + + prompts = [ + ( + "user\n" + f"{question}\n" + "model\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, prompts=prompts, @@ -1395,6 +1421,7 @@ model_example_map = { "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, + "gemma3n": run_gemma3n, "glm4v": run_glm4v, "glm4_1v": run_glm4_1v, "h2ovl_chat": run_h2ovl, diff --git a/requirements/test.in b/requirements/test.in index ca22fd1551..6652bfdfe6 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,7 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests -timm # required for internvl test +timm >=1.0.17 # required for internvl and gemma3n-mm test torch==2.7.1 torchaudio==2.7.1 torchvision==0.22.1 diff --git a/requirements/test.txt b/requirements/test.txt index 377eeb58c4..ff9886a315 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1051,7 +1051,7 @@ tiktoken==0.7.0 # via # lm-eval # mistral-common -timm==1.0.15 +timm==1.0.17 # via # -r requirements/test.in # open-clip-torch diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index bd1c55d95d..906966ddd0 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -271,6 +271,7 @@ def _test_processing_correctness_one( "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", + "google/gemma-3n-E2B-it", 
"zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", "ibm-granite/granite-speech-3.3-2b", @@ -315,7 +316,7 @@ def _test_processing_correctness_one( "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", "omni-research/Tarsier-7b", - "omni-research/Tarsier2-Recap-7b" + "omni-research/Tarsier2-Recap-7b", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -327,6 +328,8 @@ def test_processing_correctness( num_batches: int, simplify_rate: float, ): + if model_id == "google/gemma-3n-E2B-it": + pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.") _test_processing_correctness( model_id, hit_rate=hit_rate, diff --git a/tests/models/registry.py b/tests/models/registry.py index 09d62413fe..e0939d1a20 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -186,7 +186,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), - "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 + "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it", min_transformers_version="4.53"), "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"), "Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"), @@ -391,6 +391,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), + "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 + min_transformers_version="4.53"), "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501 "GLM4VForCausalLM": _HfExamplesInfo("zai-org/glm-4v-9b", trust_remote_code=True, diff --git a/tests/test_test.py b/tests/test_test.py new file mode 100644 index 0000000000..dc8c9814ed --- /dev/null +++ b/tests/test_test.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm import LLM, envs +from vllm.sampling_params import SamplingParams + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"]) +# TODO TPU will appear busy if we fan-out test params here +@pytest.mark.parametrize("n_prompts", [1]) +def test_logprobs(model_name: str, n_prompts: int): + """ + Request top logprobs with different sampling settings and check + that results contains the requested number, ordered ascendingly. + """ + + def check_num_logprobs(logprobs, expected_num: int): + for step in logprobs: + prev_logp = 1.0 + # order by rank + sorted_step = dict( + sorted(step.items(), key=lambda item: item[1].rank)) + + if len(step) != expected_num: + print("watch out", sorted_step) + + # check results are ordered by prob value + # assert len(step) == expected_num + for rankno, (tid, logp) in enumerate(sorted_step.items()): + assert logp.logprob <= prev_logp + prev_logp = logp.logprob + assert logp.rank == rankno + 1 + + llm = LLM(model_name, + enforce_eager=False, + max_num_seqs=1, + max_model_len=128, + max_num_batched_tokens=128) + prompts = [ + "Write a short story about a robot that dreams for the first time." 
+ ] * n_prompts + greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64,\ + logprobs=4) + regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\ + logprobs=4) + topkp_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\ + logprobs=4, top_k=12, top_p=0.5) + + for sp in [greedy_sampling_params, regular_sampling_params, \ + topkp_sampling_params]: + output = llm.generate(prompts, sp) + for o in output: + check_num_logprobs(o.outputs[0].logprobs, 4) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index e16c03c8d3..4b41cba1c7 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -331,14 +331,15 @@ class Gemma3nAttention(nn.Module): config.num_kv_shared_layers) self.is_kv_shared = layer_idx >= first_kv_shared_layer_idx + kv_sharing_target_layer_name = None if self.is_kv_shared: # Last full attention layer is 1 before sharing # Last sliding attention layer is 2 before sharing offset = 2 if self.sliding_window is not None else 1 kv_shared_layer_index = first_kv_shared_layer_idx - offset - kv_sharing_target_layer_name = f"model.language_model.layers.{kv_shared_layer_index}.self_attn.attn" # noqa: E501 - else: - kv_sharing_target_layer_name = None + if kv_shared_layer_index >= 0: + # Only the greater layer is required to specify sharing. + kv_sharing_target_layer_name = f"language_model.model.layers.{kv_shared_layer_index}.self_attn.attn" # noqa: E501 self.rotary_emb = get_rope( self.head_dim, @@ -396,6 +397,7 @@ class Gemma3nDecoderLayer(nn.Module): prefix: str = "", ) -> None: super().__init__() + assert isinstance(config, Gemma3nTextConfig) self.altup_active_idx = config.altup_active_idx assert config.altup_correct_scale @@ -537,7 +539,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config.text_config + config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config @@ -553,6 +555,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): config.hidden_size**0.5, dtype=self.embed_tokens.weight.dtype, ) + # Additional per-layer embeddings (PLE) self.embed_tokens_per_layer = VocabParallelEmbedding( config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input, @@ -636,6 +639,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, + per_layer_inputs: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: @@ -644,13 +649,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): else: hidden_states_0 = self.get_input_embeddings(input_ids) - # Per layer inputs. 
- if input_ids is None: - raise ValueError("Passing None for input ids is not supported.") - per_layer_inputs = self.get_per_layer_input_embeddings(input_ids) - per_layer_inputs = per_layer_inputs.reshape( - -1, self.config.num_hidden_layers, - self.config.hidden_size_per_layer_input) per_layer_projection = self.per_layer_model_projection(hidden_states_0) per_layer_projection = per_layer_projection.reshape( *hidden_states_0.shape[:-1], @@ -659,8 +657,13 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): ) per_layer_projection = self.per_layer_projection_norm( per_layer_projection) - per_layer_inputs = per_layer_projection + per_layer_inputs - per_layer_inputs *= self.per_layer_input_scale + + if per_layer_inputs is not None: + # Profiling run does not compute per_layer_inputs + per_layer_inputs = per_layer_projection + per_layer_inputs + per_layer_inputs *= self.per_layer_input_scale + else: + per_layer_inputs = per_layer_projection # Altup embed. hidden_states = [hidden_states_0] * self.config.altup_num_inputs @@ -760,29 +763,7 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): return loaded_params -class Gemma3nModel(nn.Module): - - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.language_model = Gemma3nTextModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "language_model")) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.Tensor: - return self.language_model(input_ids=input_ids, - positions=positions, - inputs_embeds=inputs_embeds, - **kwargs) - - -class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): +class Gemma3nForCausalLM(nn.Module): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -802,25 +783,33 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): super().__init__() self.config = config self.cache_config = vllm_config.cache_config - self.model = Gemma3nModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.model = Gemma3nTextModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( - config.text_config.vocab_size, - soft_cap=config.text_config.final_logit_softcapping) + config.vocab_size, soft_cap=config.final_logit_softcapping) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.language_model.get_input_embeddings(input_ids) + return self.model.get_input_embeddings(input_ids) def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, + *, + per_layer_inputs: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds, **kwargs) + + hidden_states = self.model( + input_ids, + positions, + per_layer_inputs=per_layer_inputs, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) return hidden_states def compute_logits( @@ -828,8 +817,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): hidden_states: torch.Tensor, sampling_metadata: Optional[SamplingMetadata], ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.model.language_model.embed_tokens, - hidden_states, sampling_metadata) + logits = 
self.logits_processor(self.model.embed_tokens, hidden_states, + sampling_metadata) return logits def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py new file mode 100644 index 0000000000..a0c3bb5007 --- /dev/null +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -0,0 +1,700 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Optional, TypedDict, Union, cast + +import torch +from torch import nn +from transformers import AutoModel, BatchFeature +from transformers.models.gemma3n import (Gemma3nAudioConfig, + Gemma3nAudioFeatureExtractor, + Gemma3nConfig, Gemma3nProcessor, + Gemma3nTextConfig, + Gemma3nVisionConfig) +from transformers.models.siglip import SiglipImageProcessorFast + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs) +from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, + MultiModalDataParser) +# yapf: disable +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BoundPromptUpdate, + PlaceholderFeaturesInfo, + PromptReplacement, PromptTargetMatch, + PromptUpdate, PromptUpdateDetails, + find_mm_placeholders, + replace_token_matches) +# yapf: enable +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + +logger = init_logger(__name__) + +# This should be based on model config but we hardcode them for now. 
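+# NOTE: for reference, these appear to match config.vision_soft_tokens_per_image
+# (256 image soft tokens) and the 188 audio soft tokens the Gemma3nProcessor
+# inserts for every (30s) audio clip; see the padding comment in
+# _process_audio_input below.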
+TOKENS_PER_IMAGE = 256 +TOKENS_PER_AUDIO = 188 + + +class Gemma3nImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + """Shape: `(batch_size * num_images, num_channels, height, width)`""" + + +class Gemma3nAudioInputs(TypedDict): + input_features: torch.Tensor + """Shape: `(batch_size * num_audio, seq_length, num_features)`""" + input_features_mask: torch.Tensor + """Shape: `(batch_size * num_audio, seq_length)`""" + + +Gemma3nImageInputs = Gemma3nImagePixelInputs + + +class Gemma3nProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Gemma3nConfig) + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(Gemma3nProcessor, **kwargs) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "audio": None} + + def get_max_tokens_per_item( + self, seq_len: int, + mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]: + + return {"image": TOKENS_PER_IMAGE, "audio": TOKENS_PER_AUDIO} + + def get_image_repl( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Gemma3nProcessor], + ) -> str: + """ + Get the replacement text for image tokens. + + For Gemma3n, this should return the full_image_sequence which includes + BOI token, repeated image tokens, and EOI token. + """ + if processor is None: + processor = self.get_hf_processor() + + return PromptUpdateDetails.select_token_id( + processor.full_image_sequence, processor.image_token_id) + + def get_audio_repl( + self, + *, + processor: Optional[Gemma3nProcessor], + ) -> str: + """ + Get the replacement text for audio tokens. + + For Gemma3n, this should return the full_audio_sequence which includes + BOA token, repeated audio tokens, and EOA token. + """ + if processor is None: + processor = self.get_hf_processor() + + # Return the full audio sequence as defined by the processor + return PromptUpdateDetails.select_token_id( + processor.full_audio_sequence, processor.audio_token_id) + + +class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_audios = mm_counts.get("audio", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + audio_token = processor.audio_token + + return image_token * num_images + audio_token * num_audios + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_audios = mm_counts.get("audio", 0) + processor = self.info.get_hf_processor() + audio_feature_extractor: Gemma3nAudioFeatureExtractor = processor.feature_extractor # noqa: E501 + audio_len = audio_feature_extractor.fft_length + image_processor: SiglipImageProcessorFast = processor.image_processor + img_width = image_processor.size.get("width", 224) + img_height = image_processor.size.get("height", 224) + + return { + "image": + self._get_dummy_images(width=img_width, + height=img_height, + num_images=num_images), + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + +class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] + ): + + def _get_data_parser(self) -> MultiModalDataParser: + feature_extractor = self.info.get_hf_processor().feature_extractor + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + 
mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + + # HF Transformers audio processor no longer accepts `audios` key. + # We pop `audios` and replace it with `audio` key to surpress + # the warning. + if 'audios' in mm_data: + mm_data['audio'] = mm_data.pop('audios') + processed_outputs = super()._call_hf_processor( + prompt, + mm_data, + mm_kwargs, + tok_kwargs, + ) + if 'input_features' in processed_outputs: + # Avoid padding since we need the output of each item to be + # independent of other items for the cache to work correctly + unpadded_features = [ + f[mask] for f, mask in zip( + processed_outputs["input_features"], + processed_outputs["input_features_mask"], + ) + ] + processed_outputs["input_features"] = unpadded_features + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + return dict(pixel_values=MultiModalFieldConfig.batched("image"), + input_features=MultiModalFieldConfig.batched("audio"), + input_features_mask=MultiModalFieldConfig.batched("audio")) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + prompt_updates = [] + + # Handle image tokens + if "image" in mm_items: + image_token = hf_processor.image_token + + def get_replacement_image(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + return self.info.get_image_repl( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + prompt_updates.append( + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_image, + )) + + # Handle audio tokens + if "audio" in mm_items: + audio_token = hf_processor.audio_token + + def get_replacement_audio(item_idx: int): + return self.info.get_audio_repl(processor=hf_processor, ) + + prompt_updates.append( + PromptReplacement( + modality="audio", + target=audio_token, + replacement=get_replacement_audio, + )) + + return prompt_updates + + def _apply_token_matches( + self, + prompt: list[int], + mm_matches: Mapping[str, Sequence[PromptTargetMatch]], + mm_item_counts: Mapping[str, int], + ) -> list[int]: + token_ids = super()._apply_token_matches( + prompt, + mm_matches, + mm_item_counts, + ) + + # "\n\n\n" and "\n\n\n\n" are single tokens + # Since our replacement can insert "\n\n" next to "\n" + # tokens, we have to combine them to be consistent with + # the output of the tokenizer + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + newline_1 = vocab["\n"] + newline_2 = vocab["\n\n"] + newline_3 = vocab["\n\n\n"] + newline_4 = vocab["\n\n\n\n"] + + token_ids = replace_token_matches( + token_ids, + [newline_1, newline_2], + [newline_3], + ) + token_ids = replace_token_matches( + token_ids, + [newline_2, newline_1], + [newline_3], + ) + token_ids = replace_token_matches( + token_ids, + [newline_2, newline_2], + [newline_4], + ) + + return token_ids + + def _find_mm_placeholders( + self, + mm_prompt_updates: Mapping[str, Sequence[BoundPromptUpdate]], + new_token_ids: list[int], + mm_item_counts: Mapping[str, int], + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + # We need to detect "\n\n" inside "\n\n\n" and "\n\n\n\n" + tokenizer = 
self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + newline_1 = vocab["\n"] + newline_2 = vocab["\n\n"] + newline_3 = vocab["\n\n\n"] + newline_4 = vocab["\n\n\n\n"] + + def get_repl_toks(tok: int) -> list[int]: + if tok == newline_3: + return [newline_1, newline_2] + if tok == newline_4: + return [newline_2, newline_2] + + return [tok] + + repl_token_ids = list[int]() + repl_orig_idxs = list[int]() + for orig_idx, orig_tok in enumerate(new_token_ids): + repl_toks = get_repl_toks(orig_tok) + repl_token_ids.extend(repl_toks) + repl_orig_idxs.extend(orig_idx for _ in range(len(repl_toks))) + + repls = find_mm_placeholders(mm_prompt_updates, repl_token_ids, + mm_item_counts) + + return { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=repl_orig_idxs[p.start_idx], + tokens=p.tokens, + is_embed=p.is_embed, + ) for p in placeholders + ] + for modality, placeholders in repls.items() + } + + +class Gemma3nMultimodalEmbedder(nn.Module): + """Embeds token ids or soft tokens for multimodal content into language + model space.""" + + def __init__( + self, + multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig], + text_config: Gemma3nTextConfig, + ): + super().__init__() + + self.multimodal_hidden_size = multimodal_config.hidden_size + self.eps = multimodal_config.rms_norm_eps + self.vocab_offset = multimodal_config.vocab_offset + self.vocab_size = multimodal_config.vocab_size + self.text_hidden_size = text_config.hidden_size + + self.embedding = VocabParallelEmbedding( + self.vocab_size, + self.multimodal_hidden_size, + ) + + self.hard_embedding_norm = RMSNorm( + self.multimodal_hidden_size, + eps=self.eps, + ) + + self.soft_embedding_norm = RMSNorm( + self.multimodal_hidden_size, + eps=self.eps, + ) + + self.embedding_projection = RowParallelLinear( + self.multimodal_hidden_size, + self.text_hidden_size, + bias=False, + ) + + self.embedding_post_projection_norm = RMSNorm( + self.text_hidden_size, + eps=self.eps, + has_weight=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Embeds token ids or soft tokens for multimodal content into language model space. + + Args: + input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range + `[vocab_offset, vocab_offset + vocab_size)`. + inputs_embeds: A torch.Tensor containing the soft tokens to embed. + + Returns: + A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`. 
+ """ # noqa: E501 + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is not None: + emb_norm = self.soft_embedding_norm(inputs_embeds) + else: + hard_emb = self.embedding(input_ids - self.vocab_offset) + emb_norm = self.hard_embedding_norm(hard_emb) + + emb_norm_proj, _ = self.embedding_projection(emb_norm) + return self.embedding_post_projection_norm(emb_norm_proj) + + +@MULTIMODAL_REGISTRY.register_processor(Gemma3nMultiModalProcessor, + info=Gemma3nProcessingInfo, + dummy_inputs=Gemma3nDummyInputsBuilder) +class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.embed_audio.": "embed_audio.", + "model.embed_vision.": "embed_vision.", + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.audio_tower.": "audio_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "lm_head.": "language_model.lm_head.", + "model": "language_model.model", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config + self.quant_config = quant_config + self.multimodal_config = multimodal_config + self.vocab_size = config.text_config.vocab_size + + self.sliding_window = getattr(config.text_config, + "interleaved_sliding_window", None) + + self.vision_tower = AutoModel.from_config(config=config.vision_config) + self.audio_tower = AutoModel.from_config(config=config.audio_config) + self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, + config.text_config) + self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, + config.text_config) + + self.language_model: nn.Module = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Gemma3nForCausalLM"], + ) + self.language_model = cast(Gemma3nForCausalLM, self.language_model) + # NOTE (NickLucche) In order to be compatible with cudagraph, the + # buffer needs to be consistent, so we pre-allocate here. + self.per_layer_embeddings = torch.zeros( + vllm_config.scheduler_config.max_num_batched_tokens, + self.config.text_config.num_hidden_layers, + self.config.text_config.hidden_size_per_layer_input, + device=self.language_model.model.embed_tokens.weight.device, + dtype=self.language_model.model.embed_tokens.weight.dtype) + + @property + def dtype(self): + return next(self.parameters()).dtype + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + # TODO check if there are any + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Gemma3nImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + # TODO is this the case? + assert image_embeds is None, "Gemma3n does not support image_embeds." + if pixel_values is None: + return None + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + pixel_values = flatten_bn(pixel_values, concat=True) + pixel_values = pixel_values.contiguous() + + return Gemma3nImagePixelInputs( + pixel_values=self._validate_pixel_values(pixel_values), ) + + def _parse_and_validate_audio_input( + self, **kwargs: object) -> Optional[Gemma3nAudioInputs]: + input_features = kwargs.pop("input_features", None) + if input_features is None: + return None + + input_features_mask = kwargs.pop("input_features_mask", None) + if input_features_mask is None: + return None + + return Gemma3nAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + ) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + mm_input_by_modality = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", "image_embeds" + ) and "image" not in mm_input_by_modality: + mm_input_by_modality[ + "image"] = self._parse_and_validate_image_input(**kwargs) + if input_key == "input_features" \ + and "audio" not in mm_input_by_modality: + mm_input_by_modality[ + "audio"] = self._parse_and_validate_audio_input(**kwargs) + return mm_input_by_modality + + def _process_image_input( + self, + image_input: Gemma3nImageInputs, + ) -> list[torch.Tensor]: + assert self.vision_tower is not None + + pixel_values = image_input["pixel_values"] + vision_outputs = self.vision_tower(pixel_values=pixel_values, + do_pooling=False, + return_dict=True).last_hidden_state + # TODO try to avoid copy here + # (batch, channels, height, width) to (batch, height * width, channels) + vision_outputs = vision_outputs.reshape( + vision_outputs.shape[0], + self.config.vision_config.hidden_size, + self.config.vision_soft_tokens_per_image, + ).permute(0, 2, 1).contiguous() + # Normalize and embed the soft tokens into language model space. + vision_outputs *= self.config.vision_config.hidden_size**0.5 + # Return a list of embeddings instead of a batched tensor + return self.embed_vision(inputs_embeds=vision_outputs).unbind(0) + + def _process_audio_input( + self, + audio_input: Gemma3nAudioInputs, + ) -> list[torch.Tensor]: + assert self.audio_tower is not None + input_features = audio_input["input_features"].squeeze(1) + input_features_mask = audio_input["input_features_mask"].squeeze(1) + audio_outputs, audio_mask = self.audio_tower(input_features, + ~input_features_mask) + audio_features = self.embed_audio(inputs_embeds=audio_outputs) + + # ruff: noqa + # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the + # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens + # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad + # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. 
+ # TODO precompute and cache padding + audio_padding_toks = torch.tensor([[self.vocab_size - 1]], + dtype=torch.long, + device=audio_features.device) + audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks) + audio_features = torch.where(audio_mask.unsqueeze(-1), + audio_padding_embs, audio_features) + + audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape + extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len # noqa: E501 + extra_padding_features = audio_padding_embs.expand( + audio_batch_size, extra_padding_tokens, audio_embed_dim) + + audio_features = torch.cat((audio_features, extra_padding_features), + dim=1) + # Return a list of embeddings instead of a batched tensor + return audio_features.unbind(0) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: + mm_input_by_modality = self._parse_and_validate_multimodal_inputs( + **kwargs) + if mm_input_by_modality is None: + return [] + + multimodal_embeddings: list[torch.Tensor] = [] + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in mm_input_by_modality: + multimodal_input = mm_input_by_modality[modality] + if modality == "image": + vision_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings.extend(vision_embeddings) + if modality == "audio": + audio_embeddings = self._process_audio_input(multimodal_input) + multimodal_embeddings.extend(audio_embeddings) + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache + # them here, as the model forward has only access to the input_embeds. + if input_ids is not None: + per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings( + input_ids) + per_layer_inputs = per_layer_inputs.reshape( + -1, self.config.text_config.num_hidden_layers, + self.config.text_config.hidden_size_per_layer_input) + self.per_layer_embeddings[:per_layer_inputs.shape[0]].copy_( + per_layer_inputs) + + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + # NOTE: this order of processing mm items is important + [self.config.image_token_id, self.config.audio_token_id]) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object) -> IntermediateTensors: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE (NickLucche) During profiling, `get_input_embeddings` is not + # called, hence we don't have input_ids to compute PLEs. We simply + # select a chunk of pre-allocated PLEs. During normal execution, + # `get_input_embeddings` is called before forward, hence this slice + # will contain PLEs computed from the actual input_ids. 
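+        # per_layer_embeddings is pre-allocated in __init__ with shape
+        # (max_num_batched_tokens, num_hidden_layers,
+        #  hidden_size_per_layer_input), so this slice picks one per-layer
+        # embedding row per token in the current batch.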
+ per_layer_inputs = self.per_layer_embeddings[:inputs_embeds.shape[0]] + + hidden_states = self.language_model.model( + input_ids, + positions, + per_layer_inputs=per_layer_inputs, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower") + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality == "image": + return "" + elif modality == "audio": + return "" + else: + raise ValueError(f"Unsupported modality: {modality}") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4aa958ecdc..3d8694e7b9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -69,8 +69,7 @@ _TEXT_GENERATION_MODELS = { "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"), - #TODO(ywang96): Support multimodal gemma3n - "Gemma3nForConditionalGeneration": ("gemma3n", "Gemma3nForConditionalGeneration"), # noqa: E501 + "Gemma3nForCausalLM": ("gemma3n", "Gemma3nForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), @@ -205,6 +204,7 @@ _MULTIMODAL_MODELS = { "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 + "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 From fbd8595c5c6f969dfa6cf33e5a371d93d55025fb Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 02:42:21 +0800 Subject: [PATCH 126/932] [Bugfix] Fix basic models tests hanging due to mm processor creation (#22571) Signed-off-by: Isotr0py --- vllm/multimodal/registry.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index a101f2a55f..ded56cca80 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -138,8 +138,8 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: return False - processor = self.create_processor(model_config, disable_cache=False) - supported_modalities = processor.info.get_supported_mm_limits() + info = self._create_processing_info(model_config, tokenizer=None) + supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() @@ -278,6 +278,26 @@ class MultiModalRegistry: 
model_cls, _ = get_model_architecture(model_config) return model_cls + def _create_processing_ctx( + self, + model_config: "ModelConfig", + tokenizer: Optional[AnyTokenizer] = None, + ) -> InputProcessingContext: + if tokenizer is None and not model_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(model_config) + return InputProcessingContext(model_config, tokenizer) + + def _create_processing_info( + self, + model_config: "ModelConfig", + *, + tokenizer: Optional[AnyTokenizer] = None, + ) -> BaseProcessingInfo: + model_cls = self._get_model_cls(model_config) + factories = self._processor_factories[model_cls] + ctx = self._create_processing_ctx(model_config, tokenizer) + return factories.info(ctx) + def create_processor( self, model_config: "ModelConfig", @@ -291,15 +311,13 @@ class MultiModalRegistry: if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if tokenizer is None and not model_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(model_config) if disable_cache is None: disable_cache = not model_config.enable_mm_processor_cache model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] - ctx = InputProcessingContext(model_config, tokenizer) + ctx = self._create_processing_ctx(model_config, tokenizer) cache = None if disable_cache else self._get_processor_cache( model_config) From 42172ad18fbc22c89d0063184e4570cc84186e16 Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 9 Aug 2025 11:50:03 -0700 Subject: [PATCH 127/932] [FEAT] [Performance] Add triton mrope to replace the torch code path (#22375) Signed-off-by: tjtanaa --- benchmarks/kernels/benchmark_mrope.py | 328 ++++++++++++++++++ tests/kernels/test_mrope.py | 207 +++++++++++ .../layers/rotary_embedding/mrope.py | 231 ++++++++++++ 3 files changed, 766 insertions(+) create mode 100644 benchmarks/kernels/benchmark_mrope.py create mode 100644 tests/kernels/test_mrope.py diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py new file mode 100644 index 0000000000..b914736170 --- /dev/null +++ b/benchmarks/kernels/benchmark_mrope.py @@ -0,0 +1,328 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models). +# It generates test data, runs benchmarks, and saves results to a CSV file. 
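+# Both the PyTorch-native path (forward_native) and the Triton path
+# (forward_cuda) are timed; the "speedup" column is torch_mean / triton_mean.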
+# +# The CSV file (named with current date/time) contains these columns: +# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, +# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, +# speedup +# +# == Usage Examples == +# +# Single model benchmark: +# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \ +# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models benchmark: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different TP sizes: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different token counts: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384 +import csv +import os +import time +from datetime import datetime +from typing import Any + +import numpy as np +import torch + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.utils import FlexibleArgumentParser + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data( + num_tokens: int, + num_q_heads: int, + num_kv_heads: int, + head_size: int, + max_position_embeddings: int, + dtype: torch.dtype, + device: torch.device, +): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint( + 0, max_position_embeddings // 4, (3, num_tokens), device=device + ) + + # Create query and key tensors + query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device) + key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device) + + return positions, query, key + + +def calculate_stats(times: list[float]) -> dict[str, float]: + """Calculate statistics from a list of times.""" + times_array = np.array(times) + return { + "mean": np.mean(times_array), + "median": np.median(times_array), + "p99": np.percentile(times_array, 99), + "min": np.min(times_array), + "max": np.max(times_array), + } + + +def benchmark_mrope( + model_name: str, + num_tokens: int, + head_dim: int, + tp_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 8192, + rope_theta: float = 10000, + is_neox_style: bool = True, + rope_scaling: dict[str, Any] = None, + dtype: torch.dtype = torch.bfloat16, + seed: int = 0, + warmup_iter: int = 10, + benchmark_iter: int = 100, + csv_writer=None, +): + current_platform.seed_everything(seed) + torch.set_default_device(device) + # the parameters to compute the q k v size based on tp_size + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=rope_scaling, + dtype=dtype, + ).to(device=device) + + print(80 * "=") + print( + f"Evaluating model: {model_name} " + f"with tp_size: {tp_size} " + f"and num_tokens: {num_tokens}, " + f"dtype: {dtype}" + ) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = 
generate_test_data( + num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device + ) + + # Warm up + for _ in range(warmup_iter): + mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.cuda.synchronize() + + # Time reference implementation + torch_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + + mrope_helper_class.forward_native( + positions, + query_clone, + key_clone, + ) + + torch.cuda.synchronize() + torch_times.append(time.time() - start_time) + + # Time triton kernel implementation + triton_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + mrope_helper_class.forward_cuda( + positions, + query_clone, + key_clone, + ) + torch.cuda.synchronize() + triton_times.append(time.time() - start_time) + + # Calculate statistics + torch_stats = calculate_stats(torch_times) + triton_stats = calculate_stats(triton_times) + print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):") + + print( + f"Torch implementation: " + f"mean={torch_stats['mean']:.8f}s, " + f"median={torch_stats['median']:.8f}s, " + f"p99={torch_stats['p99']:.8f}s" + ) + + print( + f"Triton implementation: " + f"mean={triton_stats['mean']:.8f}s, " + f"median={triton_stats['median']:.8f}s, " + f"p99={triton_stats['p99']:.8f}s" + ) + + print( + f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x" + ) + + # Write to CSV + if csv_writer: + row = [ + model_name, + tp_size, + num_tokens, + num_heads, + num_kv_heads, + head_dim, + max_position, + rope_theta, + is_neox_style, + str(rope_scaling), + str(dtype).split(".")[-1], + torch_stats["mean"], + torch_stats["median"], + torch_stats["p99"], + torch_stats["min"], + torch_stats["max"], + triton_stats["mean"], + triton_stats["median"], + triton_stats["p99"], + triton_stats["min"], + triton_stats["max"], + torch_stats["mean"] / triton_stats["mean"], # speedup + ] + csv_writer.writerow(row) + + return torch_stats, triton_stats + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." 
+ ) + parser.add_argument("--model-name", type=str, default="") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--warmup-iter", type=int, default=10) + parser.add_argument("--benchmark-iter", type=int, default=100) + parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num-tokens", type=int, nargs="+", required=False) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv") + args = parser.parse_args() + print(args) + + # Create CSV file for results + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv" + + with open(csv_filename, "w", newline="") as csvfile: + csv_writer = csv.writer(csvfile) + # Write header + header = [ + "model_name", + "tp_size", + "num_tokens", + "num_heads", + "num_kv_heads", + "head_dim", + "max_position", + "rope_theta", + "is_neox_style", + "rope_scaling", + "dtype", + "torch_mean", + "torch_median", + "torch_p99", + "torch_min", + "torch_max", + "triton_mean", + "triton_median", + "triton_p99", + "triton_min", + "triton_max", + "speedup", + ] + csv_writer.writerow(header) + + model_tp_dict = {} + if args.model_name == "": + model_tp_dict = { + "Qwen/Qwen2-VL-2B-Instruct": [1], + "Qwen/Qwen2-VL-7B-Instruct": [1], + "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8], + "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8], + } + else: + model_tp_dict[args.model_name] = [args.tp_size] + + if args.num_tokens is None: + num_tokens_list = [2**i for i in range(0, 18)] + else: + num_tokens_list = args.num_tokens + + for model_name, tp_list in model_tp_dict.items(): + config = get_config(model_name, trust_remote_code=args.trust_remote_code) + for tp_size in tp_list: + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + q_size = num_heads * head_dim + kv_size = num_kv_heads * head_dim + is_neox_style = True + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + for num_tokens in num_tokens_list: + benchmark_mrope( + model_name=model_name, + num_tokens=num_tokens, + head_dim=head_dim, + tp_size=tp_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position=max_position, + rope_theta=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=getattr(torch, args.dtype), + seed=args.seed, + warmup_iter=args.warmup_iter, + benchmark_iter=args.benchmark_iter, + csv_writer=csv_writer, + ) + + print(f"Benchmark results saved to {csv_filename}") diff --git a/tests/kernels/test_mrope.py b/tests/kernels/test_mrope.py new file mode 100644 index 0000000000..5918b7a58b --- /dev/null +++ b/tests/kernels/test_mrope.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data(num_tokens: 
int, num_q_heads: int, num_kv_heads: int, + head_size: int, max_position_embeddings: int, + dtype: torch.dtype, device: torch.device): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint(0, + max_position_embeddings // 4, (3, num_tokens), + device=device) + + # Create query and key tensors + query = torch.randn(num_tokens, + num_q_heads * head_size, + dtype=dtype, + device=device) + key = torch.randn(num_tokens, + num_kv_heads * head_size, + dtype=dtype, + device=device) + + return positions, query, key + + +def unroll_model_tp_dict(model_tp_dict): + return [(model_name, tp_size) + for model_name, tp_sizes in model_tp_dict.items() + for tp_size in tp_sizes] + + +model_tp_dict = { + "Qwen/Qwen2-VL-7B-Instruct": [1, 2], + "Qwen/Qwen2-VL-72B-Instruct": [1, 2], + "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2] +} + +# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 +dtype_atol_rtol_list = [ + [torch.bfloat16, 1e-5, 1.6e-2], +] + +num_tokens_list = [11, 8192] + + +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Skipping CUDA/ROCm only tests.") +@pytest.mark.parametrize("model_name, tp_size", + unroll_model_tp_dict(model_tp_dict)) +@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("num_tokens", num_tokens_list) +def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): + + config = AutoConfig.from_pretrained(model_name) + + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + is_neox_style = True + + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=dtype, + ).to(device=device) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = generate_test_data(num_tokens, num_heads, + num_kv_heads, head_dim, + max_position, dtype, device) + + query_native, key_native = mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + query_cuda, key_cuda = mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol) + torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol) + + +@pytest.mark.skipif(not current_platform.is_cuda_alike(), + reason="Skipping CUDA/ROCm only tests.") +@pytest.mark.parametrize( + "model_name, tp_size", + unroll_model_tp_dict({"Qwen/Qwen2-VL-7B-Instruct": [1, 2]})) +@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) +@pytest.mark.parametrize("num_tokens", [4]) +def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, + num_tokens): + config = AutoConfig.from_pretrained(model_name) + + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + is_neox_style = True + rope_theta = config.rope_theta + max_position = 
config.max_position_embeddings + + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=dtype, + ).to(device=device) + + # Generate test data + positions, query, key = generate_test_data(num_tokens, num_heads, + num_kv_heads, head_dim, + max_position, dtype, device) + + # Create a wrapper that makes the in-place function appear functional + def functional_forward_cuda(pos, q, k): + """Wrapper that converts in-place operation to functional style + + CUDA Graph does not support in-place operations. + This wrapper creates working copies of the + input tensors and modifies them. + """ + q_work = q.clone() # Create working copies + k_work = k.clone() + # Your in-place function modifies q_work and k_work + mrope_helper_class.forward_cuda(pos, q_work, k_work) + return q_work, k_work # Return the modified tensors + + # Get reference results + query_native, key_native = mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + try: + compiled_forward_cuda = torch.compile(functional_forward_cuda, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False) + + # Run compiled version + query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda( + positions, + query, + key, + ) + + # Run original version for comparison + query_cuda = query.clone() + key_cuda = key.clone() + mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda) + + # Verify results + torch.testing.assert_close(query_compiled_cuda, + query_cuda, + atol=atol, + rtol=rtol) + torch.testing.assert_close(key_compiled_cuda, + key_cuda, + atol=atol, + rtol=rtol) + torch.testing.assert_close(query_compiled_cuda, + query_native, + atol=atol, + rtol=rtol) + torch.testing.assert_close(key_compiled_cuda, + key_native, + atol=atol, + rtol=rtol) + + print("✓ forward_cuda successfully traced with torch.compile inductor") + + except Exception as e: + pytest.fail( + f"forward_cuda failed to trace with torch.compile inductor: {e}") diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index a75b9e5eb4..d3b71930b6 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -8,10 +8,173 @@ import numpy as np import torch from transformers import PretrainedConfig +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + from .base import RotaryEmbedding from .common import apply_rotary_emb_dispatch +@triton.jit +def _triton_qwen2vl_mrope_forward( + q_ptr, + k_ptr, + cos, + sin, + num_tokens, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + pad_n_qh: tl.constexpr, + pad_n_kh: tl.constexpr, + pad_hd: tl.constexpr, + mrope_section_t: tl.constexpr, + mrope_section_h: tl.constexpr, +): + # Adapted from + # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py + # This version supports flatten input tensors from vllm + # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) + # instead of (3, bsz, seq_len, head_dim) + pid = tl.program_id(0) + # locate start address + q_ptr = q_ptr + pid * (n_qh * hd) + k_ptr = k_ptr + pid * (n_kh * hd) + + # #################################################################### + # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position + # m of this program instance + # 
#################################################################### + # Note: cos and sin now have shape (3, num_tokens, head_dim // 2) + + t_end = mrope_section_t + h_end = t_end + mrope_section_h + + # Updated stride calculation for half head_dim + half_hd = hd // 2 + t_cos = cos + pid * half_hd + h_cos = t_cos + num_tokens * half_hd + w_cos = h_cos + num_tokens * half_hd + t_sin = sin + pid * half_hd + h_sin = t_sin + num_tokens * half_hd + w_sin = h_sin + num_tokens * half_hd + + # Updated offsets for half head_dim + cos_offsets = tl.arange(0, pad_hd // 2) + t_mask = cos_offsets < t_end + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_hd) + + t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) + h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) + w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0) + t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0) + h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0) + w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0) + + cos_row = t_cos_row + h_cos_row + w_cos_row + sin_row = t_sin_row + h_sin_row + w_sin_row + + # #################################################################### + # Load the left and right half of q and k for the current + # program instance (i.e. for the current token) separately + # #################################################################### + # left half of the head + first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange( + 0, pad_hd // 2)[None, :] + first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange( + 0, pad_hd // 2)[None, :] + first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange( + 0, pad_hd // 2)[None, :] < hd // 2) + first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange( + 0, pad_hd // 2)[None, :] < hd // 2) + + q_tile_1 = tl.load(q_ptr + first_half_q_offsets, + mask=first_q_mask, + other=0).to(sin_row.dtype) + k_tile_1 = tl.load(k_ptr + first_half_k_offsets, + mask=first_k_mask, + other=0).to(sin_row.dtype) + + # right half of the head + second_half_q_offsets = first_half_q_offsets + (hd // 2) + second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_q_mask = first_q_mask + second_k_mask = first_k_mask + + q_tile_2 = tl.load(q_ptr + second_half_q_offsets, + mask=second_q_mask, + other=0).to(sin_row.dtype) + k_tile_2 = tl.load(k_ptr + second_half_k_offsets, + mask=second_k_mask, + other=0).to(sin_row.dtype) + + # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] + # Since cos and sin are now half-size, + # we use the same cos_row and sin_row for both halves + new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + + +def triton_mrope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + mrope_section: list[int], + head_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + """Qwen2VL mrope kernel. 
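+
+    The rotation is applied in place on the (contiguous) q and k buffers;
+    non-contiguous inputs are copied by .contiguous() first. The rotated
+    tensors are also returned for convenience.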
+ + Args: + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + cos: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + sin: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + mrope_section: [t, h, w] + head_size: int + """ + n_row, n_q_head_head_dim = q.shape + n_q_head = n_q_head_head_dim // head_size + n_kv_head = k.shape[1] // head_size + pad_hd = triton.next_power_of_2(head_size) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + # ensure tensors passed into the kernel are contiguous. + # It will be no-op if they are already contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + + _triton_qwen2vl_mrope_forward[(n_row, )]( + q, + k, + cos, + sin, + n_row, + n_q_head, + n_kv_head, + head_size, + pad_n_q_head, + pad_n_kv_head, + pad_hd, + mrope_section[0], + mrope_section[1], + ) + return q, k + + class MRotaryEmbedding(RotaryEmbedding): """Rotary Embedding with Multimodal Sections.""" @@ -36,11 +199,34 @@ class MRotaryEmbedding(RotaryEmbedding): if self.mrope_section: assert sum(self.mrope_section) == rotary_dim // 2 + self.use_triton = current_platform.is_cuda_alike() + def forward( self, positions: torch.Tensor, query: torch.Tensor, key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """MRope forward. + + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + if self.use_triton: + return self.forward_cuda(positions, query, key) + else: + return self.forward_native(positions, query, key) + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward(). 
@@ -88,6 +274,51 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + query_shape = query.shape + key_shape = key.shape + if positions.ndim == 2: + assert self.mrope_section + + q, k = triton_mrope( + query, + key, + cos, + sin, + self.mrope_section, + self.head_size, + ) + + return q.reshape(query_shape), k.reshape(key_shape) + + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + @classmethod def get_input_positions( cls, From 61f67d8acdb4b77c168d1150e81a5c284c6f8ce7 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sun, 10 Aug 2025 05:16:11 +0200 Subject: [PATCH 128/932] [V1] [Hybrid] Enable Full CUDA Graph (decode-only) for Mamba layers (#21401) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 60 +++++++++++++++++++ vllm/v1/attention/backends/mamba_attn.py | 44 +++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 4934da9517..76f6c226ba 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -384,3 +384,63 @@ def test_distributed_correctness( name_0="vllm_tp_1", name_1="vllm_tp_2", ) + + +@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_full_cuda_graph( + hf_runner, + vllm_runner, + example_prompts, + monkeypatch, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + + try: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + except ValueError: + pass + + with hf_runner(model) as hf_model: + if model not in HF_UNSUPPORTED_MODELS: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + else: + hf_outputs = None + + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_v0_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + if model in HYBRID_MODELS: + # required due to reorder_batch behaviour + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + with vllm_runner(model, + max_num_seqs=MAX_NUM_SEQS, + compilation_config={'full_cuda_graph': True}, + enable_prefix_caching=False) as vllm_model: + vllm_v1_outputs = vllm_model.generate_greedy_logprobs( + 
example_prompts, max_tokens, num_logprobs) + + if hf_outputs is not None: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_v0_outputs, + name_0="hf", + name_1="vllm-v0", + ) + + ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs + check_logprobs_close( + outputs_0_lst=ref_outputs, + outputs_1_lst=vllm_v1_outputs, + name_0="hf" if hf_outputs is not None else "vllm-v0", + name_1="vllm-v1", + ) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 66a8d91db8..7c1226049f 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -7,8 +7,10 @@ from typing import ClassVar, Optional import torch from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec @@ -82,6 +84,8 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba2AttentionMetadata]): + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY reorder_batch_threshold: ClassVar[int] = 1 @@ -90,8 +94,18 @@ class Mamba2AttentionMetadataBuilder( assert isinstance(kv_cache_spec, MambaSpec) self.kv_cache_spec = kv_cache_spec self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) def build(self, common_prefix_len: int, @@ -144,6 +158,14 @@ class Mamba2AttentionMetadataBuilder( query_start_loc_p, self.chunk_size, num_prefill_tokens)) + elif num_decodes <= self.decode_cudagraph_max_bs: + # Pad state tensor for CUDA graph + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_(state_indices_tensor, + non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:num_input_tokens] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID + attn_metadata = Mamba2AttentionMetadata( num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, @@ -160,3 +182,23 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor=state_indices_tensor, ) return attn_metadata + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." 
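As a side note, the decode-only padding performed in build() above can be read in isolation: the live state indices are copied into a persistent buffer sized for the captured graph, and the padded tail is marked invalid so captured kernels skip those rows. A minimal sketch under the assumption that PAD_SLOT_ID is the usual -1 sentinel; the helper below is illustrative, not part of this patch:

import torch

PAD_SLOT_ID = -1  # assumed sentinel value, matching vllm.attention.backends.utils

def pad_state_indices_for_capture(state_indices, persistent_buf, padded_batch_size):
    # Copy the real decode entries into the buffer the CUDA graph was captured
    # against, then mark the padded tail as invalid.
    num_decodes = state_indices.shape[0]
    persistent_buf[:num_decodes].copy_(state_indices, non_blocking=True)
    padded = persistent_buf[:padded_batch_size]
    padded[num_decodes:] = PAD_SLOT_ID
    return padded

buf = torch.empty(8, dtype=torch.int32)
idx = torch.tensor([5, 9, 3], dtype=torch.int32)
print(pad_state_indices_for_capture(idx, buf, padded_batch_size=4))
# tensor([ 5,  9,  3, -1], dtype=torch.int32)
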
+ + m.max_query_len = 1 # decode-only + + return self.build(0, m) + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + return common_attn_metadata.max_query_len == 1 From 0c5254b82acc625112ce7adc10811514f1a42d52 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 10 Aug 2025 11:19:13 +0800 Subject: [PATCH 129/932] [oss] Init gpt-oss bf16 support (#22508) Signed-off-by: Jee Jee Li --- .../model_executor/layers/fused_moe/config.py | 6 +- .../layers/fused_moe/fused_moe.py | 265 +++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 40 ++- vllm/model_executor/models/gpt_oss.py | 152 +++++++++- 4 files changed, 340 insertions(+), 123 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index f2242ade0c..31ea826f1f 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -324,6 +324,8 @@ class FusedMoEConfig: max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE + has_bias: bool = False + def __post_init__(self): if self.dp_size > 1: logger.debug_once("Using FusedMoEConfig::max_num_tokens=%d", @@ -413,7 +415,8 @@ class FusedMoEConfig: in_dtype: torch.dtype, max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE, quant_config: Optional[Union[FusedMoEQuantConfig, - QuantizationConfig]] = None + QuantizationConfig]] = None, + has_bias: bool = False, ) -> "FusedMoEConfig": _quant_config: Optional[FusedMoEQuantConfig] = None @@ -482,4 +485,5 @@ class FusedMoEConfig: in_dtype=in_dtype, quant_config=_quant_config, max_num_tokens=max_num_tokens, + has_bias=has_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f4f5457ebc..3ad5f5b7ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -275,6 +275,7 @@ def fused_moe_kernel( a_ptr, b_ptr, c_ptr, + b_bias_ptr, a_scale_ptr, b_scale_ptr, topk_weights_ptr, @@ -302,6 +303,8 @@ def fused_moe_kernel( stride_bse, stride_bsk, stride_bsn, + stride_bbe, # bias expert stride + stride_bbn, # bias N stride # Block size for block-wise quantization group_n: tl.constexpr, group_k: tl.constexpr, @@ -317,6 +320,7 @@ def fused_moe_kernel( use_int8_w8a8: tl.constexpr, use_int8_w8a16: tl.constexpr, per_channel_quant: tl.constexpr, + HAS_BIAS: tl.constexpr, ): """ Implements the fused computation for a Mixture of Experts (MOE) using @@ -414,7 +418,10 @@ def fused_moe_kernel( else: a_scale = tl.load(a_scale_ptr) b_scale = tl.load(b_scale_ptr + off_experts) - + if HAS_BIAS: + # bias shape: [num_experts, N] + bias_ptrs = b_bias_ptr + off_experts * stride_bbe + offs_bn * stride_bbn + bias = tl.load(bias_ptrs, mask=(offs_bn < N), other=0.0) # ----------------------------------------------------------- # Iterate to compute a block of the C matrix. # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block @@ -456,7 +463,8 @@ def fused_moe_kernel( # Advance the ptrs to the next K block. 
a_ptrs += BLOCK_SIZE_K * stride_ak b_ptrs += BLOCK_SIZE_K * stride_bk - + if HAS_BIAS: + accumulator = accumulator + bias[None, :] if MUL_ROUTED_WEIGHT: moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, @@ -471,6 +479,7 @@ def fused_moe_kernel( accumulator = (accumulator * a_scale * b_scale).to(compute_type) else: accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) @@ -499,7 +508,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a16: bool, use_int4_w4a16: bool, per_channel_quant: bool, - block_shape: Optional[list[int]] = None) -> None: + block_shape: Optional[list[int]] = None, + B_bias: Optional[torch.Tensor] = None) -> None: assert topk_weights is not None or not mul_routed_weight assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -531,7 +541,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A.size(0) * top_k * config['BLOCK_SIZE_M']) grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( B.size(1), META['BLOCK_SIZE_N']), ) - + HAS_BIAS = B_bias is not None if (use_int8_w8a16 or use_int4_w4a16) and \ block_shape is not None and block_shape[1] > 0: assert B_scale is not None and B_scale.ndim == 3 @@ -611,6 +621,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, A, B, C, + B_bias, A_scale, B_scale, topk_weights, @@ -638,6 +649,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, if B_scale is not None and B_scale.ndim == 3 else 0, B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_bias.stride(0) if B_bias is not None else 0, + B_bias.stride(1) if B_bias is not None else 0, 0 if block_shape is None else block_shape[0], 0 if block_shape is None else block_shape[1], MUL_ROUTED_WEIGHT=mul_routed_weight, @@ -647,6 +660,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, per_channel_quant=per_channel_quant, + HAS_BIAS=HAS_BIAS, BLOCK_SIZE_K=BLOCK_SIZE_K, **config, ) @@ -1024,40 +1038,43 @@ def inplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None) -> None: #noqa: UP006 + block_shape: Optional[List[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: #noqa: UP006 fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, - a2_scale, block_shape) + a2_scale, block_shape, w1_bias, w2_bias) -def inplace_fused_experts_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: 
Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> None: +def inplace_fused_experts_fake(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> None: pass @@ -1246,36 +1263,38 @@ direct_register_custom_op( def outplace_fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, #noqa: UP006 + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: return fused_experts_impl( hidden_states, w1, w2, topk_weights, topk_ids, False, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, - w1_zp, w2_zp, a1_scale, a2_scale, block_shape) + w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias) def outplace_fused_experts_fake( @@ -1300,7 +1319,9 @@ def outplace_fused_experts_fake( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None) -> 
torch.Tensor: + block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1332,33 +1353,34 @@ def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: # TODO (bnell): replace this with modular op. Can get rid of inplace/outplace # torch ops. -def fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - is_act_and_mul: bool = True, - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - use_mxfp4_w4a4: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, - allow_deep_gemm: bool = False, - allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor: +def fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + is_act_and_mul: bool = True, + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, + allow_deep_gemm: bool = False, + allow_cutlass_block_scaled_grouped_gemm: bool = False, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None) -> torch.Tensor: # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. # However, on B200, we use DeepGemm for all cases because they only support @@ -1423,7 +1445,10 @@ def fused_experts( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias, + ) def fused_experts_impl( @@ -1451,6 +1476,8 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: # Check constraints. 
if use_int4_w4a16: @@ -1591,7 +1618,19 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w1_bias) + + # TODO fused kernel + def swiglu_oai(gate_up): + alpha = 1.702 + limit = 7.0 + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(gate * alpha) + gated_output = (up + 1) * glu + return gated_output # Activation function with multiplication if activation == "silu" and is_act_and_mul: @@ -1605,6 +1644,8 @@ def fused_experts_impl( intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) + elif activation == "swiglu_oai": + intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") @@ -1635,7 +1676,8 @@ def fused_experts_impl( use_int8_w8a16=use_int8_w8a16, use_int4_w4a16=use_int4_w4a16, per_channel_quant=per_channel_quant, - block_shape=block_shape) + block_shape=block_shape, + B_bias=w2_bias) ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()), out_hidden_states[begin_chunk_idx:end_chunk_idx]) @@ -1672,6 +1714,8 @@ def fused_moe( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + w1_bias: Optional[torch.Tensor] = None, + w2_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1766,7 +1810,9 @@ def fused_moe( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + w1_bias=w1_bias, + w2_bias=w2_bias) class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -1937,7 +1983,9 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, per_channel_quant=self.per_act_token_quant, - block_shape=self.block_shape) + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) self.activation(activation, intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -1948,26 +1996,29 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): intermediate_cache2, a2_scale, self.quant_dtype, self.per_act_token_quant, self.block_shape) - invoke_fused_moe_kernel(qintermediate_cache2, - w2, - intermediate_cache3, - a2q_scale, - w2_scale, - w2_zp, - topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - not apply_router_weight_on_input, - 1, - config, - compute_type=compute_type, - use_fp8_w8a8=self.use_fp8_w8a8, - use_int8_w8a8=self.use_int8_w8a8, - use_int8_w8a16=self.use_int8_w8a16, - use_int4_w4a16=self.use_int4_w4a16, - per_channel_quant=self.per_act_token_quant, - block_shape=self.block_shape) + invoke_fused_moe_kernel( + qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + not apply_router_weight_on_input, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_act_token_quant, + block_shape=self.block_shape, + B_bias=None # TODO support B_bias + ) 
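For reference, the "swiglu_oai" activation introduced in fused_experts_impl above differs from the standard silu-and-mul in two ways: the gate and up projections are interleaved column-wise instead of concatenated, and both halves are clamped before the gated product. A small self-contained sketch of the same computation and its shape contract (illustrative only; 1.702 and 7.0 are the values hard-coded in the diff):

import torch

def swiglu_oai_reference(gate_up, alpha=1.702, limit=7.0):
    # Even columns are the gate, odd columns are the up projection.
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
    gate = gate.clamp(max=limit)           # gate is clamped from above only
    up = up.clamp(min=-limit, max=limit)   # up is clamped on both sides
    return (up + 1) * gate * torch.sigmoid(gate * alpha)

x = torch.randn(16, 2 * 1024)   # [num_tokens, 2 * intermediate_size]
y = swiglu_oai_reference(x)
assert y.shape == (16, 1024)    # the activation halves the hidden dimension
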
ops.moe_sum(intermediate_cache3, output) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d664a92841..d5a89655e3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -255,7 +255,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self.fused_experts = fused_experts # type: ignore self.topk_indices_dtype = None self.moe = moe - + self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: from .rocm_aiter_fused_moe import rocm_aiter_fused_experts @@ -291,7 +291,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - + if self.has_bias: + w13_bias = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) # down_proj (row parallel) w2_weight = torch.nn.Parameter(torch.empty( num_experts, @@ -301,6 +308,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + if self.has_bias: + w2_bias = torch.nn.Parameter(torch.zeros(num_experts, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor: # Pad the weight tensor. This is an optimization on ROCm platform, which @@ -465,6 +479,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, + w1_bias=layer.w13_bias if self.has_bias else None, + w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -702,6 +718,7 @@ class FusedMoE(torch.nn.Module): activation: str = "silu", enable_eplb: bool = False, num_redundant_experts: int = 0, + has_bias: bool = False, ): super().__init__() if params_dtype is None: @@ -793,16 +810,15 @@ class FusedMoE(torch.nn.Module): # since model_config is not set in the pytest test. 
model_dtype = params_dtype - moe = FusedMoEConfig.make( - num_experts=self.global_num_experts, - experts_per_token=top_k, - hidden_dim=hidden_size, - num_local_experts=self.local_num_experts, - moe_parallel_config=self.moe_parallel_config, - in_dtype=model_dtype, - max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - quant_config=quant_config, - ) + moe = FusedMoEConfig.make(num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + in_dtype=model_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + quant_config=quant_config, + has_bias=has_bias) self.moe_config = moe self.quant_config = quant_config diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index feb323a045..6a65bbbe2e 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -160,7 +160,9 @@ class MLPBlock(torch.nn.Module): renormalize=True, quant_config=quant_config, prefix=f"{prefix}.experts", - apply_router_weight_on_input=False) + apply_router_weight_on_input=False, + has_bias=True, + activation="swiglu_oai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) @@ -262,8 +264,8 @@ class GptOssForCausalLM(nn.Module): sampling_metadata) return logits - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def _load_weights_mxfp4( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: rename_mapping = { "self_attn": "attn", "input_layernorm.weight": "attn.norm.weight", @@ -469,3 +471,147 @@ class GptOssForCausalLM(nn.Module): loaded_params.add(renamed_name) return loaded_params + + def _load_weights_other( + self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + rename_mapping = { + "self_attn": "attn", + "input_layernorm.weight": "attn.norm.weight", + "post_attention_layernorm.weight": "mlp.norm.weight", + "embed_tokens": "embedding", + } + + def maybe_rename(name: str) -> str: + for remap_name, new_name in rename_mapping.items(): + if remap_name in name: + return name.replace(remap_name, new_name) + return name + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + intermediate_size = self.model_config.intermediate_size + + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + # Attention heads per rank + heads_per_rank = self.model_config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + use_ep = self.vllm_config.parallel_config.enable_expert_parallel + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.model_config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + for name, weight in weights: + if ".experts.gate_up_proj" in name and "bias" not in name: + # Handle MLP gate and up projection weights + new_name = name.replace(".experts.gate_up_proj", + ".experts.w13_weight") + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif ".experts.down_proj" in name and "bias" not in name: + # Handle MLP down projection weights + new_name = name.replace(".experts.down_proj", + ".experts.w2_weight") + + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "gate_up_proj_bias" in name: + # Handle MLP gate and up projection biases + new_name = name.replace("gate_up_proj_bias", "w13_bias") + + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[new_name] + + param.copy_(narrow_weight) + loaded_params.add(new_name) + + elif "down_proj_bias" in name: + # Handle MLP down projection bias + new_name = name.replace("down_proj_bias", "w2_bias") + + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[new_name] + param.copy_(weight) + loaded_params.add(new_name) + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + name = name.replace("self_attn", "attn") + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + elif "q_proj" in name or "k_proj" in name or "v_proj" in name: + shard_id = ("q" if "q_proj" in name else + "k" if "k_proj" in name else "v") + name = name.replace("self_attn", "attn") + param_name = name.replace(f"{shard_id}_proj", "qkv") + param = params_dict[param_name] + weight_loader = param.weight_loader + weight_loader(param, weight, loaded_shard_id=shard_id) + loaded_params.add(param_name) + else: + # Handle all other weights with potential renaming + + renamed_name = maybe_rename(name) + if renamed_name not in params_dict: + continue + param = params_dict[renamed_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(renamed_name) + + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + quant_method = (self.model_config.quantization_config['quant_method'] + if hasattr(self.model_config, "quantization_config") + else None) + if quant_method == "mxfp4": + return self._load_weights_mxfp4(weights) + else: + return self._load_weights_other(weights) From 3d7363e61c0a27bcba9e6694ae9771f9b780ce3d Mon Sep 17 00:00:00 2001 From: Le Chen Date: Sun, 10 Aug 2025 11:21:05 +0800 Subject: [PATCH 130/932] [Config] add "qwen" as a native eagle3 target supported model (#22333) Signed-off-by: lechen Signed-off-by: LeChen --- tests/models/registry.py | 4 +++ tests/v1/e2e/test_spec_decode.py | 39 +++++++++++++----------- vllm/config/__init__.py | 8 +---- vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/eagle.py | 5 +-- 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index e0939d1a20..898e38a4ae 100644 --- a/tests/models/registry.py +++ 
b/tests/models/registry.py @@ -525,6 +525,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), + "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + trust_remote_code=True, + speculative_model="AngelSlim/Qwen3-8B_eagle3", + tokenizer="Qwen/Qwen3-8B"), "EagleLlama4ForCausalLM": _HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 4950faf826..cd383b58db 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -125,24 +125,27 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize( - ["model_setup", "mm_enabled"], [ - (("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - ], - ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"]) +@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ + (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), +], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", + "llama4_eagle", "llama4_eagle_mm" + ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 7efab23f14..b2826de93d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2852,13 +2852,7 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - from vllm.transformers_utils.configs import SpeculatorsConfig - - eagle3_target_supported = ["llama"] - if self.draft_model_config and isinstance( - self.draft_model_config.hf_config, SpeculatorsConfig): - eagle3_target_supported.append("qwen") - + eagle3_target_supported = ["llama", "qwen"] if self.method == "eagle3" and self.target_model_config and not any( supported_model in self.target_model_config.hf_text_config.model_type diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3d8694e7b9..aca3d84f00 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -259,6 +259,7 @@ _SPECULATIVE_DECODING_MODELS = { "EagleLlama4ForCausalLM": ("llama4_eagle", 
"EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 5445a333c4..01217eb191 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -45,6 +45,7 @@ class EAGLEConfig(PretrainedConfig): # Eagle model name should follow naming convention of # LlamaForCausalLM -> EagleLlamaForCausalLM + # LlamaForCausalLM -> Eagle3LlamaForCausalLM / LlamaForCausalLMEagle3 if method == "eagle": assert self.model is not None, \ "model should not be None when method is eagle" @@ -56,8 +57,8 @@ class EAGLEConfig(PretrainedConfig): assert self.model is not None, \ "model should not be None when method is eagle3" kwargs["architectures"] = [ - f"Eagle3{arch}" if not arch.startswith("Eagle3") \ - else arch for arch in self.model.architectures + arch if arch.startswith("Eagle3") or arch.endswith("Eagle3") + else f"Eagle3{arch}" for arch in self.model.architectures ] else: raise ValueError(f"Invalid method {method}. \ From 534c45b9620d4d97cf2ea2cdee77e8461844a243 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sun, 10 Aug 2025 11:25:42 +0800 Subject: [PATCH 131/932] Improve fast_topk function with type hints and documentation (#22530) Signed-off-by: zitian.zhao --- vllm/model_executor/models/utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c69df6e616..6c27fedc61 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -736,7 +736,23 @@ def cast_overflow_tensors( return tensors -def fast_topk(values, topk, dim): +def fast_topk(values: torch.Tensor, topk: int, + dim: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Optimized topk implementation that uses torch.max for k=1 case. + + This function provides better performance for the common case of k=1 + by using torch.max instead of the more general torch.topk. + + Args: + values: Input tensor to find top-k values from + topk: Number of top values to return (k). Must be > 0. 
+ dim: Dimension along which to compute topk + + Returns: + Tuple of (values, indices) where values are the top-k values + and indices are their corresponding indices in the input tensor + """ if topk == 1: # Use max along the specified dimension to get both value and index return torch.max(values, dim=dim, keepdim=True) From 2a84fb422fc62ab29238dccbf7bdb214fc51c31e Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 9 Aug 2025 20:49:04 -0700 Subject: [PATCH 132/932] [TPU] kv cache update kernel doesn't need to be padded slices to multiple of num_slices_per_block (#22394) Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao --- tests/v1/tpu/test_kv_cache_update_kernel.py | 5 ----- vllm/attention/ops/pallas_kv_cache_update.py | 16 ++++++++++------ vllm/v1/worker/tpu_model_runner.py | 19 +++++++++---------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/tests/v1/tpu/test_kv_cache_update_kernel.py b/tests/v1/tpu/test_kv_cache_update_kernel.py index f82737325e..acb607247d 100644 --- a/tests/v1/tpu/test_kv_cache_update_kernel.py +++ b/tests/v1/tpu/test_kv_cache_update_kernel.py @@ -43,11 +43,6 @@ def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int, np.cumsum(slice_lens[:-1])]) slot_mapping = np.stack( [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1) - padded_size = (slot_mapping.shape[0] + num_slices_per_block - - 1) // num_slices_per_block * num_slices_per_block - slot_mapping = np.pad(slot_mapping, - [[0, padded_size - slot_mapping.shape[0]], [0, 0]], - constant_values=0) slot_mapping = np.transpose(slot_mapping) slot_mapping_cpu = torch.tensor(slot_mapping, device="cpu", diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/attention/ops/pallas_kv_cache_update.py index e7d727a45e..d75983bd40 100644 --- a/vllm/attention/ops/pallas_kv_cache_update.py +++ b/vllm/attention/ops/pallas_kv_cache_update.py @@ -14,6 +14,7 @@ def _kv_cache_update_kernel( # Prefetch slices_ref, # [3, padded_num_slices], list of (kv_cache_start, # new_kv_start, slice_len) + num_slices_ref, # [1] # Input new_kv_hbm_ref, # [num_tokens, num_combined_kv_heads, head_dim] kv_cache_hbm_ref, # [total_num_pages * page_size, num_combined_kv_heads, @@ -32,8 +33,10 @@ def _kv_cache_update_kernel( # Copy from new_kv_hbm_ref to scratch for i in range(num_slices_per_block): offset_i = i + block_idx * num_slices_per_block - new_kv_start = slices_ref[1, offset_i] - length = slices_ref[2, offset_i] + new_kv_start = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[1, offset_i], 0) + length = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[2, offset_i], 0) async_copy = pltpu.make_async_copy( new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...], scratch.at[i, pl.ds(0, length), ...], @@ -49,8 +52,10 @@ def _kv_cache_update_kernel( async_copies.clear() for i in range(num_slices_per_block): offset_i = i + block_idx * num_slices_per_block - kv_cache_start = slices_ref[0, offset_i] - length = slices_ref[2, offset_i] + kv_cache_start = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[0, offset_i], 0) + length = jax.lax.select(offset_i < num_slices_ref[0], + slices_ref[2, offset_i], 0) async_copy = pltpu.make_async_copy( scratch.at[i, pl.ds(0, length), ...], kv_cache_hbm_ref.at[pl.ds(kv_cache_start, length), ...], @@ -77,7 +82,6 @@ def kv_cache_update( page_size: int = 32, num_slices_per_block: int = 8, ): - assert slices.shape[1] % num_slices_per_block == 0 _, num_combined_kv_heads, head_dim = new_kv.shape assert kv_cache.shape[1] == 
num_combined_kv_heads assert kv_cache.shape[2] == head_dim @@ -93,7 +97,7 @@ def kv_cache_update( out_specs = [pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY)] out_shape = [jax.ShapeDtypeStruct(kv_cache.shape, dtype=kv_cache.dtype)] - scalar_prefetches = [slices] + scalar_prefetches = [slices, num_kv_update_slices] scratch = pltpu.VMEM( (num_slices_per_block, page_size, num_combined_kv_heads, head_dim), new_kv.dtype, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 442c0ea068..915869726f 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -745,7 +745,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_kv_update_slices = slot_mapping_metadata.shape[0] padded_num_slices = _get_padded_num_kv_cache_update_slices( padded_total_num_scheduled_tokens, self.max_num_reqs, - self.block_size, self._num_slices_per_kv_cache_update_block) + self.block_size) slot_mapping_metadata = np.pad( slot_mapping_metadata, [[0, padded_num_slices - len(slot_mapping_metadata)], [0, 0]], @@ -1244,8 +1244,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): position_ids = torch.zeros(num_tokens, dtype=torch.int32).to(self.device) padded_num_slices = _get_padded_num_kv_cache_update_slices( - num_tokens, self.max_num_reqs, self.block_size, - self._num_slices_per_kv_cache_update_block) + num_tokens, self.max_num_reqs, self.block_size) num_kv_update_slices = torch.tensor([padded_num_slices], dtype=torch.int32).to(self.device) slot_mapping = torch.zeros((3, padded_num_slices), @@ -1963,17 +1962,17 @@ def copy_kv_blocks( _copy_fn(src_tensor, dst_tensor, src_indices, dst_indices) -def _get_padded_num_kv_cache_update_slices( - num_tokens: int, max_num_reqs: int, page_size: int, - num_slices_per_kv_cache_update_block: int) -> int: +def _get_padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int, + page_size: int) -> int: """Calculates the padded number of KV cache update slices to avoid recompilation.""" + # NOTE(chengjiyao): let's say R_i is the token num for i-th request, + # so it occupies most 2 + R_i // page_size pages. 
The total maximum + # possible number of pages needed is sum(2 + R_i // page_size), which + # is <= 2 * max_num_reqs + sum(R_i) // page_size + # = 2 * max_num_reqs + num_tokens // page_size padded_num_slices = 2 * max_num_reqs + num_tokens // page_size padded_num_slices = min(padded_num_slices, num_tokens) - padded_num_slices = ( - padded_num_slices + num_slices_per_kv_cache_update_block - 1 - ) // num_slices_per_kv_cache_update_block * \ - num_slices_per_kv_cache_update_block return padded_num_slices From c49848396d34a1059fbec2a197394484acf5a903 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 04:50:48 +0100 Subject: [PATCH 133/932] Refactor sliding window configuration to Transformers best practice (#21927) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/model/basic.md | 2 +- tests/test_config.py | 22 ---- vllm/config/__init__.py | 111 ++++++--------------- vllm/engine/arg_utils.py | 10 +- vllm/model_executor/models/commandr.py | 20 ++-- vllm/model_executor/models/exaone4.py | 21 +--- vllm/model_executor/models/gemma2.py | 9 +- vllm/model_executor/models/gemma3.py | 14 +-- vllm/model_executor/models/gemma3_mm.py | 6 +- vllm/model_executor/models/gemma3n.py | 13 ++- vllm/model_executor/models/gritlm.py | 4 +- vllm/model_executor/models/llama.py | 17 +--- vllm/model_executor/models/phi4flash.py | 9 +- vllm/model_executor/models/qwen2.py | 4 +- vllm/model_executor/models/transformers.py | 52 ++-------- vllm/transformers_utils/config.py | 40 ++++++++ 16 files changed, 123 insertions(+), 231 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index edd9a47e13..21b1f21d60 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -117,7 +117,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m To support a model with interleaving sliding windows, we need to take care of the following details: -- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model. +- Make sure the model's `config.json` contains `layer_types`. - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. diff --git a/tests/test_config.py b/tests/test_config.py index 441c07b99a..19b1b74e42 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -200,28 +200,6 @@ def test_disable_sliding_window(model_id_expected): assert model_config.max_model_len == expected -def test_get_sliding_window(): - TEST_SLIDING_WINDOW = 4096 - # Test that the sliding window is correctly computed. - # For Qwen1.5/Qwen2, get_sliding_window() should be None - # when use_sliding_window is False. 
- qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B") - - qwen2_model_config.hf_config.use_sliding_window = False - qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW - assert qwen2_model_config.get_sliding_window() is None - - qwen2_model_config.hf_config.use_sliding_window = True - assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW - - mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1") - mistral_model_config.hf_config.sliding_window = None - assert mistral_model_config.get_sliding_window() is None - - mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW - assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW - - @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config(): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index b2826de93d..49da3fd848 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -40,8 +40,9 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - maybe_override_with_speculators_target_model, try_get_generation_config, - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) + is_interleaved, maybe_override_with_speculators_target_model, + try_get_generation_config, try_get_safetensors_metadata, + try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf conflicts with isort for this block @@ -714,53 +715,31 @@ class ModelConfig: revision=self.revision, ) - # Workaround for Gemma 2 which uses interleaved sliding window - # attention, but it's not specified in its config. - # TODO: remove this when Gemma 2 config updated in HuggingFace. - if self.hf_text_config.model_type == "gemma2": - self.hf_text_config.sliding_window_pattern = 2 - - # TODO: remove this when Gemma 3n config updated in HuggingFace. - if self.hf_text_config.model_type == "gemma3n_text": - # 4 sliding window attention followed by 1 full attention - self.hf_text_config.sliding_window_pattern = "LLLLG" - - sliding_window = getattr(self.hf_text_config, "sliding_window", None) - sliding_window_pattern = getattr(self.hf_text_config, - "sliding_window_pattern", None) - has_interleaved_attention = sliding_window_pattern is not None or ( - isinstance(sliding_window, list)) - - if not self.disable_sliding_window and has_interleaved_attention: - if not envs.VLLM_USE_V1 and (backend := envs.VLLM_ATTENTION_BACKEND - ) in ("XFORMERS", "FLASHINFER"): - sliding_window_len_min = get_min_sliding_window( - self.hf_text_config.sliding_window) - - logger.warning_once( - "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501 - self.hf_text_config.model_type, - backend, - sliding_window_len_min, - ) - self.disable_sliding_window = True - else: - # for a model with interleaved attention, - # the scheduler and the model treat it as full attention - # (i.e., not dropping any tokens outside the window). - # only the attention layer itself is aware of the sliding - # window, and use the window size to compute the attention. 
- self.hf_text_config.interleaved_sliding_window = sliding_window - - if hasattr(self.hf_text_config, "sliding_window"): - delattr(self.hf_text_config, "sliding_window") - - sliding_window = None + # Interleaved attention is not supported by some backends in V0 + if (not self.disable_sliding_window + and is_interleaved(self.hf_text_config) + and not envs.VLLM_USE_V1 + and (backend := envs.VLLM_ATTENTION_BACKEND) + in ("XFORMERS", "FLASHINFER")): + logger.warning_once( + "%s has interleaved attention, which is currently not " + "supported by the %s backend. Disabling sliding window and " + "capping the max length to the sliding window size (%d).", + self.hf_text_config.model_type, + backend, + self.hf_text_config.sliding_window, + ) + self.disable_sliding_window = True self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.multimodal_config = self._init_multimodal_config() + if self.disable_sliding_window: + # Set after get_and_verify_max_len to ensure that max_model_len + # can be correctly capped to sliding window size + self.hf_text_config.sliding_window = None + if not self.skip_tokenizer_init: self._verify_tokenizer_mode() @@ -1322,27 +1301,10 @@ class ModelConfig: if self.use_async_output_proc: self.use_async_output_proc = False - def get_hf_config_sliding_window( - self) -> Union[Optional[int], list[Optional[int]]]: - """Get the sliding window size, or None if disabled.""" - - # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in - # addition to sliding window size. We check if that field is present - # and if it's False, return None. - if (hasattr(self.hf_text_config, "use_sliding_window") - and not self.hf_text_config.use_sliding_window): - return None + def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: - """Get the sliding window size, or None if disabled. - """ - # If user disables sliding window, return None. - if self.disable_sliding_window: - return None - # Otherwise get the value from the hf config. - return self.get_hf_config_sliding_window() - def get_vocab_size(self) -> int: return getattr(self.hf_text_config, "vocab_size", 0) @@ -1762,7 +1724,7 @@ class ModelConfig: tokenizer_config=tokenizer_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window(), + sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config) logger.info("Using max model len %s", max_model_len) @@ -3305,7 +3267,7 @@ def _get_and_verify_max_len( tokenizer_config: Optional[dict], max_model_len: Optional[int], disable_sliding_window: bool, - sliding_window_len: Optional[Union[int, list[Optional[int]]]], + sliding_window: Optional[int], spec_target_max_model_len: Optional[int] = None, encoder_config: Optional[Any] = None, ) -> int: @@ -3344,13 +3306,10 @@ def _get_and_verify_max_len( # If sliding window is manually disabled, max_length should be less # than the sliding window length in the model config. 
- if disable_sliding_window and sliding_window_len is not None: - - sliding_window_len_min = get_min_sliding_window(sliding_window_len) - max_len_key = "sliding_window" \ - if sliding_window_len_min < derived_max_model_len else max_len_key - derived_max_model_len = min(derived_max_model_len, - sliding_window_len_min) + if (disable_sliding_window and sliding_window is not None + and sliding_window < derived_max_model_len): + max_len_key = "sliding_window" + derived_max_model_len = sliding_window # Consider model_max_length in tokenizer_config if tokenizer_config: @@ -3451,14 +3410,6 @@ def _get_and_verify_max_len( return int(max_model_len) -def get_min_sliding_window( - sliding_window: Union[int, list[Optional[int]]]) -> int: - if isinstance(sliding_window, list): - return min(s for s in sliding_window if s is not None) - - return sliding_window - - def get_served_model_name(model: str, served_model_name: Optional[Union[str, list[str]]]): """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4d4ce4c78e..4767201617 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,6 +39,7 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 +from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -1081,6 +1082,13 @@ class EngineArgs: "DualChunkFlashAttention is not supported on V1 engine. " "To run the model in V0 engine, try set 'VLLM_USE_V1=0'") + sliding_window: Optional[int] = None + if not is_interleaved(model_config.hf_text_config): + # Only set CacheConfig.sliding_window if the model is all sliding + # window. Otherwise CacheConfig.sliding_window will override the + # global layers in interleaved sliding window models. 
+ sliding_window = model_config.get_sliding_window() + cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, @@ -1088,7 +1096,7 @@ class EngineArgs: cache_dtype=self.kv_cache_dtype, is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, - sliding_window=model_config.get_sliding_window(), + sliding_window=sliding_window, enable_prefix_caching=self.enable_prefix_caching, prefix_caching_hash_algo=self.prefix_caching_hash_algo, cpu_offload_gb=self.cpu_offload_gb, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 69281abf73..4dd84b8f8f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -182,21 +182,13 @@ class CohereAttention(nn.Module): ) # Model v2 has interleaved sliding windows, v1 does not - interleaved_sliding_window = getattr(config, - "interleaved_sliding_window", - None) - self.v1 = interleaved_sliding_window is None + self.v1 = isinstance(config, CohereConfig) - layer_idx = extract_layer_index(prefix) - layer_has_sliding_window = ( - getattr(config, "sliding_window_pattern", False) and - (layer_idx + 1) % self.config.sliding_window_pattern - != 0) or (getattr(config, "layer_types", False) - and config.layer_types[layer_idx] == "sliding_attention") - - self.sliding_window = (interleaved_sliding_window - or config.sliding_window - if layer_has_sliding_window else None) + self.sliding_window = None + if not self.v1: + layer_idx = extract_layer_index(prefix) + if config.layer_types[layer_idx] == "sliding_attention": + self.sliding_window = config.sliding_window self.attn = Attention(self.num_heads, self.head_dim, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index ecd942a76a..827e901418 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -159,25 +159,12 @@ class Exaone4Attention(nn.Module): if quant_config is not None and quant_config.get_name() == "gguf": is_neox_style = False - self.apply_all_layers = False # apply rotary embeddings to every layer. 
layer_idx = extract_layer_index(prefix) - interleaved_sliding_window = getattr(config, - "interleaved_sliding_window", - 4096) - sliding_window_pattern = getattr(config, "sliding_window_pattern", - "LLLG") + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if is_sliding else None - if sliding_window_pattern: - layer_has_sliding_window = ( - layer_idx + 1) % sliding_window_pattern.__len__() != 0 - else: - layer_has_sliding_window = False - self.apply_all_layers = True - - if layer_has_sliding_window: - self.sliding_window = interleaved_sliding_window - else: - self.sliding_window = None + # apply rotary embeddings to every layer + self.apply_all_layers = not is_sliding self.rotary_emb = get_rope( self.head_dim, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 8beefb2cd0..8cfe92c645 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -144,13 +144,10 @@ class Gemma2Attention(nn.Module): is_neox_style=True, ) - # reference: - # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa layer_idx = extract_layer_index(prefix) - use_sliding_window = (layer_idx % 2 == 0 and getattr( - config, "interleaved_sliding_window", None) is not None) - sliding_window = config.interleaved_sliding_window if \ - use_sliding_window else None + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if is_sliding else None + self.attn = Attention(self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1a2ce65d1e..b762be3c52 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -146,25 +146,19 @@ class Gemma3Attention(nn.Module): self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) - # TODO(woosuk): Add reference to the original HF implementation. layer_idx = extract_layer_index(prefix) - self.is_sliding = (getattr( - config, "interleaved_sliding_window", None) is not None and (bool( - (layer_idx + 1) % config.sliding_window_pattern))) or ( - getattr(config, "layer_types", None) is not None - and config.layer_types[layer_idx] == "sliding_attention") + self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if self.is_sliding else None + # Initialize the rotary embedding. if self.is_sliding: # Local attention. Override the values in config.json. self.rope_theta = config.rope_local_base_freq self.rope_scaling = {"rope_type": "default"} - self.sliding_window = (config.interleaved_sliding_window - or config.sliding_window) else: # Global attention. Use the values in config.json. 
self.rope_theta = config.rope_theta self.rope_scaling = config.rope_scaling - self.sliding_window = None self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, @@ -182,7 +176,7 @@ class Gemma3Attention(nn.Module): cache_config=cache_config, quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, - per_layer_sliding_window=self.sliding_window, + per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn") def forward( diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e9ee1ebdcc..9871b11b37 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -502,8 +502,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.sliding_window = getattr(config.text_config, - "interleaved_sliding_window", None) self.vision_tower = SiglipVisionModel(config.vision_config, quant_config, @@ -690,11 +688,11 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) global_attn_masks.append(global_attn_mask) - if self.sliding_window is not None: + if (sliding_window := self.config.sliding_window) is not None: # Create a local causal mask with sliding window (1024). local_attn_mask = torch.ones_like(global_attn_mask) local_attn_mask = torch.tril(local_attn_mask, - diagonal=-self.sliding_window) + diagonal=-sliding_window) local_attn_mask = torch.where(local_attn_mask == 0, global_attn_mask, float("-inf")) local_attn_masks.append(local_attn_mask) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 4b41cba1c7..ffec340870 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -313,17 +313,16 @@ class Gemma3nAttention(nn.Module): has_weight=False) layer_idx = extract_layer_index(prefix) + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.sliding_window = config.sliding_window if is_sliding else None - is_sliding_window = ( - getattr(config, "interleaved_sliding_window", None) is not None - and config.layer_types[layer_idx] == "sliding_attention") - - if is_sliding_window: - self.sliding_window = config.interleaved_sliding_window + # Initialize the rotary embedding. + if is_sliding: + # Local attention. Override the values in config.json. rope_theta = config.rope_local_base_freq rope_scaling = {"rope_type": "default"} else: - self.sliding_window = None + # Global attention. Use the values in config.json. 
rope_theta = config.rope_theta rope_scaling = config.rope_scaling diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index c99970284a..9e7490e3c4 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -248,9 +248,7 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): vllm_config.cache_config.sliding_window = None - for attr in ("sliding_window", "interleaved_sliding_window"): - if hasattr(hf_config, attr): - delattr(hf_config, attr) + hf_config.sliding_window = None super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 48ec611df1..bc511d8339 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -167,18 +167,11 @@ class LlamaAttention(nn.Module): rope_scaling=rope_scaling, quant_config=quant_config) - if hasattr(config, "interleaved_sliding_window"): - interleaved_sliding_window = config.interleaved_sliding_window - if isinstance(interleaved_sliding_window, int): - sliding_window = interleaved_sliding_window - elif isinstance(interleaved_sliding_window, list): - sw_idx = layer_idx % len(interleaved_sliding_window) - sliding_window = interleaved_sliding_window[sw_idx] - else: - raise ValueError( - f"{type(interleaved_sliding_window)} is not supported.") - else: - sliding_window = None + sliding_window = None + if layer_types := getattr(config, "layer_types", None): + is_sliding = layer_types[layer_idx] == "sliding_attention" + if is_sliding: + sliding_window = config.sliding_window self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index 1a761d01fc..493a4192d3 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -116,13 +116,8 @@ class SambaYAttention(nn.Module): self.Wqkv = nn.Linear(self.hidden_size, op_size, bias=True) # disable sliding window for the second half of the model - sliding_window = config.interleaved_sliding_window[layer_idx] - if layer_idx >= config.num_hidden_layers // 2: - assert sliding_window is None, \ - "sliding_window must be none for the second decoder" - else: - assert sliding_window is not None, \ - "sliding_window must be set for the first decoder" + is_sliding = config.layer_types[layer_idx] == "sliding_attention" + sliding_window = config.sliding_window if is_sliding else None assert self.num_heads % 2 == 0, 'num_heads should be even' assert self.num_key_value_heads % 2 == 0, 'num_heads should be even' diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e4f0de04e9..7304fbf120 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -49,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import is_interleaved from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -285,8 +286,7 @@ class Qwen2Model(nn.Module): quant_config = vllm_config.quant_config # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): + if is_interleaved(vllm_config.model_config.hf_text_config): 
assert config.max_window_layers == config.num_hidden_layers, ( "Sliding window for some but all layers is not supported. " "This model uses sliding window but `max_window_layers` = {} " diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 92e132045c..fc4585618b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -16,7 +16,7 @@ # limitations under the License. """Wrapper around `transformers` models""" from collections.abc import Iterable, Mapping -from contextlib import contextmanager, nullcontext +from contextlib import contextmanager from typing import Literal, Optional, Union import regex as re @@ -382,33 +382,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): ) -class ConfigOverride: - """Context manager to temporarily override config attributes.""" - - def __init__(self, config: PretrainedConfig, **kwargs): - self.config = config - self.kwargs = kwargs - self.kwargs_original = {} - self.kwargs_delete = set() - - def __enter__(self): - """Override config attributes.""" - for key, value in self.kwargs.items(): - if not hasattr(self.config, key): - self.kwargs_delete.add(key) - self.kwargs_original[key] = getattr(self.config, key, None) - setattr(self.config, key, value) - return self.config - - def __exit__(self, exc_type, exc_value, traceback): - """Restore original config attributes.""" - for key, value in self.kwargs_original.items(): - if key in self.kwargs_delete: - delattr(self.config, key) - else: - setattr(self.config, key, value) - - class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" @@ -434,21 +407,11 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # To be updated in child classes for use in `load_weights` self.skip_prefixes: Optional[list[str]] = None - # vLLM handles interleaved sliding window attention by creating a new - # interleaved_sliding_window attribute and deleting the sliding_window - # attribute. This breaks the constructors in Transformers so we - # temporarily add the attribute back to construct the model. - config_override = nullcontext() - if hasattr(self.config, "interleaved_sliding_window"): - config_override = ConfigOverride( - self.config, - sliding_window=self.config.interleaved_sliding_window) - # Set correct attn and init on "meta" to delay allocating GPU tensors # TODO: @raushan, use the public `model.set_attn_implementation()` # method once its checks are fixed in Transformers. 
self.text_config._attn_implementation = "vllm" - with init_on_device_without_buffers("meta"), config_override: + with init_on_device_without_buffers("meta"): self.model: PreTrainedModel = AutoModel.from_config( self.config, torch_dtype=self.model_config.dtype, @@ -575,11 +538,10 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): attention_instances = {} for i in range(start, end): # Handle interleaved sliding window attention - sliding_window = None - if (hasattr(self.config, "interleaved_sliding_window") - and hasattr(self.config, "sliding_window_pattern") - and ((i + 1) % self.config.sliding_window_pattern > 0)): - sliding_window = self.config.interleaved_sliding_window + per_layer_sliding_window = None + if (hasattr(self.config, "layer_types") + and self.config.layer_types[i] == "sliding_attention"): + per_layer_sliding_window = self.config.sliding_window attention_instances[i] = Attention( num_heads=num_heads, @@ -590,7 +552,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): num_kv_heads=num_kv_heads, cache_config=self.cache_config, quant_config=self.quant_config, - per_layer_sliding_window=sliding_window, + per_layer_sliding_window=per_layer_sliding_window, prefix=f"{i}.attn") return attention_instances diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index de779f94a4..6b70164c8c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -280,6 +280,17 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool: return getattr(config, "is_encoder_decoder", False) +def is_interleaved(config: PretrainedConfig) -> bool: + """ + Detect if the model with this config is used with interleaved attention. + """ + text_config = config.get_text_config() + if layer_types := getattr(text_config, "layer_types", None): + interleaved_types = {"full_attention", "sliding_attention"} + return interleaved_types.issubset(layer_types) + return False + + def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: """Remap config attributes to match the expected names.""" for old_attr, new_attr in _CONFIG_ATTRS_MAPPING.items(): @@ -423,6 +434,23 @@ def get_config( raise e config = _maybe_remap_hf_config_attrs(config) + # Phi4Flash misuses this config as list[int]. Convert it to int and add + # the layer_types list[str] to make it HF compatible + if (config.model_type == "phi4flash"): + # TODO: Remove after the following PR is merged: + # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6 + if not hasattr(config, "layer_types"): + config.layer_types = [ + "sliding_attention" if i < config.num_hidden_layers // 2 + and i % 2 == 1 else "full_attention" + for i in range(config.num_hidden_layers) + ] + # TODO: Remove after the following PR is merged: + # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7 + if isinstance(config.sliding_window, list): + config.sliding_window = next( + filter(None, config.sliding_window), None) + elif config_format == ConfigFormat.MISTRAL: # This function loads a params.json config which # should be used when loading models in mistral format @@ -434,6 +462,18 @@ def get_config( config_dict["max_position_embeddings"] = max_position_embeddings config = adapt_config_dict(config_dict) + + # Mistral configs may define sliding_window as list[int]. 
Convert it + # to int and add the layer_types list[str] to make it HF compatible + if ((sliding_window := getattr(config, "sliding_window", None)) + and isinstance(sliding_window, list)): + pattern_repeats = config.num_hidden_layers // len(sliding_window) + layer_types = sliding_window * pattern_repeats + config.layer_types = [ + "full_attention" if layer_type is None else "sliding_attention" + for layer_type in layer_types + ] + config.sliding_window = next(filter(None, sliding_window), None) else: supported_formats = [ fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO From 7e8d685775fe9e11c3cea79e84418a9f0bab4a5f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 15:08:23 +0800 Subject: [PATCH 134/932] [Minor] Fix pre-commit error on main (#22579) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3ad5f5b7ad..86cc6e0e5d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1038,9 +1038,9 @@ def inplace_fused_experts( w2_zp: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, + block_shape: Optional[List[int]] = None, #noqa: UP006 w1_bias: Optional[torch.Tensor] = None, - w2_bias: Optional[torch.Tensor] = None) -> None: #noqa: UP006 + w2_bias: Optional[torch.Tensor] = None) -> None: fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, activation, is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, From 326976291b541f0fd5bef34aa1ff4a84bf8fb37d Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 10 Aug 2025 15:08:48 +0800 Subject: [PATCH 135/932] [Misc] code clean duplicate set_current_vllm_config in _set_vllm_config (#22566) Signed-off-by: Andy Xie --- tests/kernels/moe/modular_kernel_tools/parallel_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 1f8d21a7a7..459b785e65 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -36,7 +36,6 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int, import tempfile temp_file = tempfile.mkstemp()[1] - set_current_vllm_config(vllm_config) with set_current_vllm_config(vllm_config): init_distributed_environment( world_size=world_size, From 010e0e39ea49508a94ad42062505d7629e19b8d2 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 10 Aug 2025 01:35:22 -0700 Subject: [PATCH 136/932] [Doc] Fix API doc link in side navigation (#22585) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- docs/.nav.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/.nav.yml b/docs/.nav.yml index 77342e2674..f57703c329 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -1,5 +1,5 @@ nav: - - Home: + - Home: - vLLM: README.md - Getting Started: - getting_started/quickstart.md @@ -11,7 +11,7 @@ nav: - Quick Links: - User Guide: usage/README.md - Developer Guide: contributing/README.md - - API Reference: api/README.md + - API Reference: api/summary.md - CLI Reference: cli/README.md - Timeline: - Roadmap: 
https://roadmap.vllm.ai @@ -49,7 +49,7 @@ nav: - General: - glob: contributing/* flatten_single_child_sections: true - - Model Implementation: + - Model Implementation: - contributing/model/README.md - contributing/model/basic.md - contributing/model/registration.md From d411df029648ff8107bddf89594b101879960491 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 10 Aug 2025 20:49:48 +0800 Subject: [PATCH 137/932] [Misc] Further refine type annotations in parallel state (#22499) Signed-off-by: DarkLight1337 --- vllm/distributed/eplb/eplb_state.py | 3 --- vllm/distributed/parallel_state.py | 36 +++++++++++++++-------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index c415d409f7..979f2a06ce 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -259,7 +259,6 @@ class EplbState: if global_expert_load is not None: ep_group = get_ep_group().device_group - assert ep_group is not None assert global_expert_load.shape == (model.num_moe_layers, model.num_logical_experts) assert global_expert_load.dtype == torch.int64 @@ -366,7 +365,6 @@ class EplbState: # Collect load metrics from all ranks ep_group = get_ep_group().device_group - assert ep_group is not None all_reduce(total_expert_load_pass, group=ep_group) # num_tokens_per_rank: (num_moe_layers, num_ranks) @@ -422,7 +420,6 @@ class EplbState: """ ep_group = get_ep_group().device_group - assert ep_group is not None ep_rank = ep_group.rank() time_start = None diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0b3993ca02..b89aee99c8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -197,11 +197,10 @@ class GroupCoordinator: # 3 | 1 | 3 | 1 | 3 local_rank: int # local rank used to assign devices rank_in_group: int # rank inside the group - cpu_group: Optional[ProcessGroup] # group for CPU communication - device_group: Optional[ProcessGroup] # group for device communication - use_device_communicator: bool # whether to use device communicator - device_communicator: Optional[ - DeviceCommunicatorBase] # device communicator + cpu_group: ProcessGroup # group for CPU communication + device_group: ProcessGroup # group for device communication + # device communicator (if use_device_communicator=True) + device_communicator: Optional[DeviceCommunicatorBase] mq_broadcaster: Optional[Any] # shared memory broadcaster def __init__( @@ -209,7 +208,7 @@ class GroupCoordinator: group_ranks: list[list[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], - use_device_communicator: bool, + use_device_communicator: bool, # whether to use device communicator use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ): @@ -219,8 +218,9 @@ class GroupCoordinator: self.rank = torch.distributed.get_rank() self.local_rank = local_rank - self.device_group = None - self.cpu_group = None + + self_device_group = None + self_cpu_group = None for ranks in group_ranks: device_group = torch.distributed.new_group( @@ -232,11 +232,14 @@ class GroupCoordinator: self.ranks = ranks self.world_size = len(ranks) self.rank_in_group = ranks.index(self.rank) - self.device_group = device_group - self.cpu_group = cpu_group + self_device_group = device_group + self_cpu_group = cpu_group - assert self.cpu_group is not None - assert self.device_group is not None + assert self_cpu_group is not None + assert self_device_group is not None + + 
self.cpu_group = self_cpu_group + self.device_group = self_device_group from vllm.platforms import current_platform @@ -251,7 +254,6 @@ class GroupCoordinator: self.device = torch.device("cpu") self.use_device_communicator = use_device_communicator - self.device_communicator = None if use_device_communicator and self.world_size > 1: device_comm_cls = resolve_obj_by_qualname( @@ -817,12 +819,12 @@ class GroupCoordinator: return self.device_communicator.recv(size, dtype, src) def destroy(self): - if self.device_group is not None: + if hasattr(self, "device_group"): torch.distributed.destroy_process_group(self.device_group) - self.device_group = None - if self.cpu_group is not None: + del self.device_group + if hasattr(self, "cpu_group"): torch.distributed.destroy_process_group(self.cpu_group) - self.cpu_group = None + del self.cpu_group if self.device_communicator is not None: self.device_communicator.destroy() if self.mq_broadcaster is not None: From 00976db0c311be2b0bbc6f7769918f61a8d17bcf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:49:51 +0100 Subject: [PATCH 138/932] [Docs] Fix warnings in docs build (#22588) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/api/summary.md | 2 - docs/configuration/tpu.md | 2 +- docs/contributing/model/multimodal.md | 8 +- docs/models/generative_models.md | 4 +- docs/models/pooling_models.md | 2 +- docs/models/supported_models.md | 2 +- vllm/attention/layers/__init__.py | 0 vllm/inputs/__init__.py | 10 +- vllm/model_executor/warmup/__init__.py | 0 vllm/sampling_params.py | 140 +++++++++++-------------- 10 files changed, 80 insertions(+), 90 deletions(-) create mode 100644 vllm/attention/layers/__init__.py create mode 100644 vllm/model_executor/warmup/__init__.py diff --git a/docs/api/summary.md b/docs/api/summary.md index db4dab0ae5..327472df1d 100644 --- a/docs/api/summary.md +++ b/docs/api/summary.md @@ -1,7 +1,5 @@ # Summary -[](){ #configuration } - ## Configuration API documentation for vLLM's configuration classes. diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index a2941c80bd..a93435ed71 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ ### Tune your workloads -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. +Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case. ### Future Topics We'll Cover diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 3295b8c711..64a48be326 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -540,8 +540,10 @@ return a schema of the tensors outputted by the HF processor that are related to The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore `(1, num_images, num_patches, patch_width * patch_height * num_channels)`. 
- In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, - we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: + In order to support the use of + [MultiModalFieldConfig.batched][vllm.multimodal.inputs.MultiModalFieldConfig.batched] + like in LLaVA, we remove the extra batch dimension by overriding + [BaseMultiModalProcessor._call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]: ??? code @@ -816,7 +818,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2), [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3), and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4), -decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.processing.MultiModalRegistry.register_processor] +decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor] to register them to the multi-modal registry: ```diff diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index a3ad413593..a64ecd31eb 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. +which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text. ## Configuration @@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration][configuration] for a list of options when initializing the model. +See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. ### `LLM.generate` diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index c6588363b6..39f209d0eb 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference The [LLM][vllm.LLM] class provides various methods for offline inference. -See [configuration][configuration] for a list of options when initializing the model. +See [configuration](../api/summary.md#configuration) for a list of options when initializing the model. ### `LLM.embed` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 5c48998ba4..ddab7ad5d9 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -770,7 +770,7 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. 
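The `generative_models.md` hunk above points readers at offline inference through `LLM.generate` and `SamplingParams`. A minimal sketch of that flow, with an illustrative model name that is not taken from this patch:

    from vllm import LLM, SamplingParams

    # Illustrative checkpoint; any generative model supported by vLLM works here.
    llm = LLM(model="facebook/opt-125m")

    # Sampling knobs documented in the vllm/sampling_params.py hunk further below.
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

    outputs = llm.generate(["The capital of France is"], params)
    for output in outputs:
        # Each RequestOutput carries one or more completions; print the first.
        print(output.outputs[0].text)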
-| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][parallelism-scaling] | [V1](gh-issue:8779) | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ | diff --git a/vllm/attention/layers/__init__.py b/vllm/attention/layers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 37bf2b7a44..aef7841e71 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, embeds_inputs, +from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, + EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, + ProcessorInputs, PromptType, SingletonInputs, + SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, + build_explicit_enc_dec_prompt, embeds_inputs, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) from .registry import (DummyData, InputContext, InputProcessingContext, InputRegistry) @@ -24,6 +25,7 @@ __all__ = [ "ExplicitEncoderDecoderPrompt", "TokenInputs", "EmbedsInputs", + "EmbedsPrompt", "token_inputs", "embeds_inputs", "DecoderOnlyInputs", diff --git a/vllm/model_executor/warmup/__init__.py b/vllm/model_executor/warmup/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 52e4cbd096..df4cca9ba1 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -103,113 +103,89 @@ class SamplingParams( Overall, we follow the sampling parameters from the OpenAI text completion API (https://platform.openai.com/docs/api-reference/completions/create). In addition, we support beam search, which is not supported by OpenAI. - - Args: - n: Number of output sequences to return for the given prompt. - best_of: Number of output sequences that are generated from the prompt. - From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. By default, - `best_of` is set to `n`. Warning, this is only supported in V0. - presence_penalty: Float that penalizes new tokens based on whether they - appear in the generated text so far. Values > 0 encourage the model - to use new tokens, while values < 0 encourage the model to repeat - tokens. - frequency_penalty: Float that penalizes new tokens based on their - frequency in the generated text so far. Values > 0 encourage the - model to use new tokens, while values < 0 encourage the model to - repeat tokens. - repetition_penalty: Float that penalizes new tokens based on whether - they appear in the prompt and the generated text so far. Values > 1 - encourage the model to use new tokens, while values < 1 encourage - the model to repeat tokens. - temperature: Float that controls the randomness of the sampling. 
Lower - values make the model more deterministic, while higher values make - the model more random. Zero means greedy sampling. - top_p: Float that controls the cumulative probability of the top tokens - to consider. Must be in (0, 1]. Set to 1 to consider all tokens. - top_k: Integer that controls the number of top tokens to consider. Set - to 0 (or -1) to consider all tokens. - min_p: Float that represents the minimum probability for a token to be - considered, relative to the probability of the most likely token. - Must be in [0, 1]. Set to 0 to disable this. - seed: Random seed to use for the generation. - stop: list of strings that stop the generation when they are generated. - The returned output will not contain the stop strings. - stop_token_ids: list of tokens that stop the generation when they are - generated. The returned output will contain the stop tokens unless - the stop tokens are special tokens. - bad_words: list of words that are not allowed to be generated. - More precisely, only the last token of a corresponding - token sequence is not allowed when the next generated token - can complete the sequence. - include_stop_str_in_output: Whether to include the stop strings in - output text. Defaults to False. - ignore_eos: Whether to ignore the EOS token and continue generating - tokens after the EOS token is generated. - max_tokens: Maximum number of tokens to generate per output sequence. - min_tokens: Minimum number of tokens to generate per output sequence - before EOS or stop_token_ids can be generated - logprobs: Number of log probabilities to return per output token. - When set to None, no probability is returned. If set to a non-None - value, the result includes the log probabilities of the specified - number of most likely tokens, as well as the chosen tokens. - Note that the implementation follows the OpenAI API: The API will - always return the log probability of the sampled token, so there - may be up to `logprobs+1` elements in the response. - When set to -1, return all `vocab_size` log probabilities. - prompt_logprobs: Number of log probabilities to return per prompt token. - detokenize: Whether to detokenize the output. Defaults to True. - skip_special_tokens: Whether to skip special tokens in the output. - spaces_between_special_tokens: Whether to add spaces between special - tokens in the output. Defaults to True. - logits_processors: list of functions that modify logits based on - previously generated tokens, and optionally prompt tokens as - a first argument. - truncate_prompt_tokens: If set to -1, will use the truncation size - supported by the model. If set to an integer k, will use only - the last k tokens from the prompt (i.e., left truncation). - Defaults to None (i.e., no truncation). - guided_decoding: If provided, the engine will construct a guided - decoding logits processor from these parameters. Defaults to None. - logit_bias: If provided, the engine will construct a logits processor - that applies these logit biases. Defaults to None. - allowed_token_ids: If provided, the engine will construct a logits - processor which only retains scores for the given token ids. - Defaults to None. - extra_args: Arbitrary additional args, that can be used by custom - sampling implementations, plugins, etc. Not used by any in-tree - sampling implementations. """ n: int = 1 + """Number of output sequences to return for the given prompt.""" best_of: Optional[int] = None + """Number of output sequences that are generated from the prompt. 
From + these `best_of` sequences, the top `n` sequences are returned. `best_of` + must be greater than or equal to `n`. By default, `best_of` is set to `n`. + Warning, this is only supported in V0.""" _real_n: Optional[int] = None presence_penalty: float = 0.0 + """Penalizes new tokens based on whether they appear in the generated text + so far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" frequency_penalty: float = 0.0 + """Penalizes new tokens based on their frequency in the generated text so + far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens.""" temperature: float = 1.0 + """Controls the randomness of the sampling. Lower values make the model + more deterministic, while higher values make the model more random. Zero + means greedy sampling.""" top_p: float = 1.0 + """Controls the cumulative probability of the top tokens to consider. Must + be in (0, 1]. Set to 1 to consider all tokens.""" top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens.""" min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this.""" seed: Optional[int] = None + """Random seed to use for the generation.""" stop: Optional[Union[str, list[str]]] = None + """String(s) that stop the generation when they are generated. The returned + output will not contain the stop strings.""" stop_token_ids: Optional[list[int]] = None + """Token IDs that stop the generation when they are generated. The returned + output will contain the stop tokens unless the stop tokens are special + tokens.""" ignore_eos: bool = False + """Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated.""" max_tokens: Optional[int] = 16 + """Maximum number of tokens to generate per output sequence.""" min_tokens: int = 0 + """Minimum number of tokens to generate per output sequence before EOS or + `stop_token_ids` can be generated""" logprobs: Optional[int] = None + """Number of log probabilities to return per output token. When set to + `None`, no probability is returned. If set to a non-`None` value, the + result includes the log probabilities of the specified number of most + likely tokens, as well as the chosen tokens. Note that the implementation + follows the OpenAI API: The API will always return the log probability of + the sampled token, so there may be up to `logprobs+1` elements in the + response. When set to -1, return all `vocab_size` log probabilities.""" prompt_logprobs: Optional[int] = None + """Number of log probabilities to return per prompt token.""" # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. detokenize: bool = True + """Whether to detokenize the output.""" skip_special_tokens: bool = True + """Whether to skip special tokens in the output.""" spaces_between_special_tokens: bool = True + """Whether to add spaces between special tokens in the output.""" # Optional[list[LogitsProcessor]] type. 
We use Any here because # Optional[list[LogitsProcessor]] type is not supported by msgspec. logits_processors: Optional[Any] = None + """Functions that modify logits based on previously generated tokens, and + optionally prompt tokens as a first argument.""" include_stop_str_in_output: bool = False + """Whether to include the stop strings in output text.""" truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None + """If set to -1, will use the truncation size supported by the model. If + set to an integer k, will use only the last k tokens from the prompt + (i.e., left truncation). If set to `None`, truncation is disabled.""" output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE # The below fields are not supposed to be used as an input. @@ -219,12 +195,24 @@ class SamplingParams( # Fields used to construct logits processors guided_decoding: Optional[GuidedDecodingParams] = None + """If provided, the engine will construct a guided decoding logits + processor from these parameters.""" logit_bias: Optional[dict[int, float]] = None + """If provided, the engine will construct a logits processor that applies + these logit biases.""" allowed_token_ids: Optional[list[int]] = None + """If provided, the engine will construct a logits processor which only + retains scores for the given token ids.""" extra_args: Optional[dict[str, Any]] = None + """Arbitrary additional args, that can be used by custom sampling + implementations, plugins, etc. Not used by any in-tree sampling + implementations.""" # Fields used for bad words bad_words: Optional[list[str]] = None + """Words that are not allowed to be generated. More precisely, only the + last token of a corresponding token sequence is not allowed when the next + generated token can complete the sequence.""" _bad_words_token_ids: Optional[list[list[int]]] = None @staticmethod From 049c245143ef0f8fd338fc3200f51a18fc53b403 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 10 Aug 2025 21:18:21 +0800 Subject: [PATCH 139/932] [Misc] Replace flaky image urls in pixtral test (#22574) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> --- .../multimodal/generation/test_pixtral.py | 24 +++++++++---------- tests/models/utils.py | 3 ++- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index e157d6f4a7..d39cf70678 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -18,7 +18,7 @@ from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test -from ...utils import check_logprobs_close +from ...utils import check_logprobs_close, dummy_hf_overrides if TYPE_CHECKING: from _typeshed import StrPath @@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID] IMG_URLS = [ - "https://picsum.photos/id/237/400/300", - "https://picsum.photos/id/231/200/300", - "https://picsum.photos/id/27/500/500", - "https://picsum.photos/id/17/150/600", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg", + "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg", + 
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg", ] PROMPT = "Describe each image in one short sentence." @@ -110,11 +110,6 @@ MSGS = [ _create_msg_format(IMG_URLS[:2]), _create_msg_format(IMG_URLS), ] -ENGINE_INPUTS = [ - _create_engine_inputs(IMG_URLS[:1]), - _create_engine_inputs(IMG_URLS[:2]), - _create_engine_inputs(IMG_URLS), -] SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) @@ -195,7 +190,6 @@ def test_chat( name_1="output") -@large_gpu_test(min_gb=48) @pytest.mark.parametrize("prompt,expected_ranges", [(_create_engine_inputs_hf(IMG_URLS[:1]), [PlaceholderRange(offset=11, length=494)]), @@ -204,7 +198,7 @@ def test_chat( PlaceholderRange(offset=277, length=1056), PlaceholderRange(offset=1333, length=418) ])]) -def test_multi_modal_placeholders(vllm_runner, prompt, +def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt, expected_ranges: list[PlaceholderRange], monkeypatch) -> None: @@ -215,6 +209,8 @@ def test_multi_modal_placeholders(vllm_runner, prompt, "mistral-community/pixtral-12b", max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + load_format="dummy", + hf_overrides=dummy_hf_overrides, ) as vllm_model: outputs = vllm_model.llm.generate(prompt) @@ -230,5 +226,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, expected_ranges), f"{image_placeholder_ranges=}" for real_range, expected_range in zip(image_placeholder_ranges, expected_ranges): - assert real_range == expected_range, \ + assert real_range.offset == expected_range.offset, \ + f"{real_range=} {expected_range=}" + assert real_range.length == expected_range.length, \ f"{real_range=} {expected_range=}" diff --git a/tests/models/utils.py b/tests/models/utils.py index 1e3d51aeec..11ddf45c8e 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -357,7 +357,8 @@ class RerankModelInfo(NamedTuple): def dummy_hf_overrides( hf_config: PretrainedConfig, - model_arch: str, + *, + model_arch: str = "", exist_overrides: Optional[dict[str, Any]] = None, ) -> PretrainedConfig: """ From 8290d15d2c6a4a82e4fd0af86b352aa522178a68 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 10 Aug 2025 15:36:40 +0100 Subject: [PATCH 140/932] Move `CacheConfig` from `config/__init__.py` to `config/cache.py` (#22586) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 190 +------------------------------------ vllm/config/cache.py | 204 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 186 deletions(-) create mode 100644 vllm/config/cache.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 49da3fd848..700d29f956 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -29,6 +29,8 @@ from typing_extensions import Self, assert_never, runtime_checkable import vllm.envs as envs from vllm import version +from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, + PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig @@ -49,9 +51,8 @@ from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf: disable from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, - LayerBlockType, LazyLoader, 
common_broadcastable_dtype, - get_cpu_memory, random_uuid) + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType, + LazyLoader, common_broadcastable_dtype, random_uuid) # yapf: enable @@ -1731,189 +1732,6 @@ class ModelConfig: return max_model_len -BlockSize = Literal[1, 8, 16, 32, 64, 128] -CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] -PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] - - -@config -@dataclass -class CacheConfig: - """Configuration for the KV cache.""" - - block_size: SkipValidation[BlockSize] = None # type: ignore - """Size of a contiguous cache block in number of tokens. This is ignored on - neuron devices and set to `--max-model-len`. On CUDA devices, only block - sizes up to 32 are supported. On HPU devices, block size defaults to 128. - - This config has no static default. If left unspecified by the user, it will - be set in `Platform.check_and_update_config()` based on the current - platform.""" - gpu_memory_utilization: float = 0.9 - """The fraction of GPU memory to be used for the model executor, which can - range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory - utilization. If unspecified, will use the default value of 0.9. This is a - per-instance limit, and only applies to the current vLLM instance. It does - not matter if you have another vLLM instance running on the same GPU. For - example, if you have two vLLM instances running on the same GPU, you can - set the GPU memory utilization to 0.5 for each instance.""" - swap_space: float = 4 - """Size of the CPU swap space per GPU (in GiB).""" - cache_dtype: CacheDType = "auto" - """Data type for kv cache storage. If "auto", will use model data type. - CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports - fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).""" - is_attention_free: bool = False - """Whether the model is attention-free. This is primarily set in - `ModelConfig` and that value should be manually duplicated here.""" - num_gpu_blocks_override: Optional[int] = None - """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks` - if specified. Does nothing if `None`. Used for testing preemption.""" - sliding_window: Optional[int] = None - """Sliding window size for the KV cache. This is primarily set in - `ModelConfig` and that value should be manually duplicated here.""" - enable_prefix_caching: Optional[bool] = None - """Whether to enable prefix caching. Disabled by default for V0. Enabled by - default for V1.""" - prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin" - """Set the hash algorithm for prefix caching:\n - - "builtin" is Python's built-in hash.\n - - "sha256" is collision resistant but with certain overheads. - This option uses Pickle for object serialization before hashing.\n - - "sha256_cbor_64bit" provides a reproducible, cross-language compatible - hash. It serializes objects using canonical CBOR and hashes them with - SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256 - digest.""" - cpu_offload_gb: float = 0 - """The space in GiB to offload to CPU, per GPU. Default is 0, which means - no offloading. Intuitively, this argument can be seen as a virtual way to - increase the GPU memory size. For example, if you have one 24 GB GPU and - set this to 10, virtually you can think of it as a 34 GB GPU. Then you can - load a 13B model with BF16 weight, which requires at least 26GB GPU memory. 
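The 26 GB figure in the `cpu_offload_gb` docstring follows from simple weight-size arithmetic. A quick sketch, using only the numbers from the docstring's own example:

    params = 13e9                                  # 13B parameters
    bytes_per_param = 2                            # BF16 stores 2 bytes per weight
    weights_gb = params * bytes_per_param / 1e9    # ~26 GB for the weights alone
    gpu_gb, offload_gb = 24, 10                    # the docstring's example setup
    assert weights_gb <= gpu_gb + offload_gb       # fits once CPU offload is enabled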
- Note that this requires fast CPU-GPU interconnect, as part of the model is - loaded from CPU memory to GPU memory on the fly in each model forward pass. - """ - calculate_kv_scales: bool = False - """This enables dynamic calculation of `k_scale` and `v_scale` when - kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model - checkpoint if available. Otherwise, the scales will default to 1.0.""" - cpu_kvcache_space_bytes: Optional[int] = None - """(CPU backend only) CPU key-value cache space.""" - mamba_page_size_padded: Optional[int] = None - """ Optional override for mamba page size; used by hybrid mamba/attention - models to ensure exact alignment with attention page size.""" - - # Will be set after profiling. - num_gpu_blocks: Optional[int] = field(default=None, init=False) - """The number of blocks to allocate for GPU memory.""" - num_cpu_blocks: Optional[int] = field(default=None, init=False) - """The number of blocks to allocate for CPU memory.""" - - kv_sharing_fast_prefill: bool = False - """This feature is work in progress and no prefill optimization takes place - with this flag enabled currently. - - In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), - some layers can skip tokens corresponding to prefill. This flag enables - attention metadata for eligible layers to be overriden with metadata - necessary for implementating this optimization in some models (e.g. Gemma3n) - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - factors: list[Any] = [] - factors.append(self.cache_dtype) - # `cpu_offload_gb` does not use `torch.compile` yet. - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self) -> None: - self.swap_space_bytes = self.swap_space * GiB_bytes - - self._verify_cache_dtype() - self._verify_prefix_caching() - - def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus - # metrics info - return {key: str(value) for key, value in self.__dict__.items()} - - @model_validator(mode='after') - def _verify_args(self) -> Self: - if self.cpu_offload_gb < 0: - raise ValueError("CPU offload space must be non-negative" - f", but got {self.cpu_offload_gb}") - - if self.gpu_memory_utilization > 1.0: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") - - if self.kv_sharing_fast_prefill: - logger.warning_once( - "--kv-sharing-fast-prefill is currently work in progress " - "and not functional yet (i.e. no prefill savings)") - - return self - - def _verify_cache_dtype(self) -> None: - if self.cache_dtype == "auto": - pass - elif self.cache_dtype in get_args(CacheDType): - logger.info( - "Using fp8 data type to store kv cache. It reduces the GPU " - "memory footprint and boosts the performance. 
" - "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor.") - else: - raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") - - def _verify_prefix_caching(self) -> None: - if not self.enable_prefix_caching: - return - - if self.sliding_window is not None and not envs.VLLM_USE_V1: - raise NotImplementedError( - "Prefix caching is not supported with sliding window. " - "Run with --disable-sliding-window to use prefix caching.") - - if (self.enable_prefix_caching and self.prefix_caching_hash_algo - not in get_args(PrefixCachingHashAlgo)): - raise ValueError( - "Unknown prefix caching hash algorithm: " - f"{self.prefix_caching_hash_algo}. Must be one of " - f"{get_args(PrefixCachingHashAlgo)}.") - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - total_cpu_memory = get_cpu_memory() - # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel - # group are in the same node. However, the GPUs may span multiple nodes. - num_gpus_per_node = parallel_config.tensor_parallel_size - cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - - msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " - f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " - "is allocated for the swap space.") - if cpu_memory_usage > 0.7 * total_cpu_memory: - raise ValueError("Too large swap space. " + msg) - elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. %s", msg) - - @config @dataclass class LoadConfig: diff --git a/vllm/config/cache.py b/vllm/config/cache.py new file mode 100644 index 0000000000..69cb0d9732 --- /dev/null +++ b/vllm/config/cache.py @@ -0,0 +1,204 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, get_args + +from pydantic import SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import GiB_bytes, get_cpu_memory + +if TYPE_CHECKING: + from vllm.config.parallel import ParallelConfig +else: + ParallelConfig = Any + +logger = init_logger(__name__) + +BlockSize = Literal[1, 8, 16, 32, 64, 128] +CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] +PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"] + + +@config +@dataclass +class CacheConfig: + """Configuration for the KV cache.""" + + block_size: SkipValidation[BlockSize] = None # type: ignore + """Size of a contiguous cache block in number of tokens. This is ignored on + neuron devices and set to `--max-model-len`. On CUDA devices, only block + sizes up to 32 are supported. On HPU devices, block size defaults to 128. + + This config has no static default. If left unspecified by the user, it will + be set in `Platform.check_and_update_config()` based on the current + platform.""" + gpu_memory_utilization: float = 0.9 + """The fraction of GPU memory to be used for the model executor, which can + range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory + utilization. If unspecified, will use the default value of 0.9. This is a + per-instance limit, and only applies to the current vLLM instance. It does + not matter if you have another vLLM instance running on the same GPU. 
For + example, if you have two vLLM instances running on the same GPU, you can + set the GPU memory utilization to 0.5 for each instance.""" + swap_space: float = 4 + """Size of the CPU swap space per GPU (in GiB).""" + cache_dtype: CacheDType = "auto" + """Data type for kv cache storage. If "auto", will use model data type. + CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports + fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc).""" + is_attention_free: bool = False + """Whether the model is attention-free. This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + num_gpu_blocks_override: Optional[int] = None + """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks` + if specified. Does nothing if `None`. Used for testing preemption.""" + sliding_window: Optional[int] = None + """Sliding window size for the KV cache. This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + enable_prefix_caching: Optional[bool] = None + """Whether to enable prefix caching. Disabled by default for V0. Enabled by + default for V1.""" + prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin" + """Set the hash algorithm for prefix caching:\n + - "builtin" is Python's built-in hash.\n + - "sha256" is collision resistant but with certain overheads. + This option uses Pickle for object serialization before hashing.\n + - "sha256_cbor_64bit" provides a reproducible, cross-language compatible + hash. It serializes objects using canonical CBOR and hashes them with + SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256 + digest.""" + cpu_offload_gb: float = 0 + """The space in GiB to offload to CPU, per GPU. Default is 0, which means + no offloading. Intuitively, this argument can be seen as a virtual way to + increase the GPU memory size. For example, if you have one 24 GB GPU and + set this to 10, virtually you can think of it as a 34 GB GPU. Then you can + load a 13B model with BF16 weight, which requires at least 26GB GPU memory. + Note that this requires fast CPU-GPU interconnect, as part of the model is + loaded from CPU memory to GPU memory on the fly in each model forward pass. + """ + calculate_kv_scales: bool = False + """This enables dynamic calculation of `k_scale` and `v_scale` when + kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model + checkpoint if available. Otherwise, the scales will default to 1.0.""" + cpu_kvcache_space_bytes: Optional[int] = None + """(CPU backend only) CPU key-value cache space.""" + mamba_page_size_padded: Optional[int] = None + """ Optional override for mamba page size; used by hybrid mamba/attention + models to ensure exact alignment with attention page size.""" + + # Will be set after profiling. + num_gpu_blocks: Optional[int] = field(default=None, init=False) + """The number of blocks to allocate for GPU memory.""" + num_cpu_blocks: Optional[int] = field(default=None, init=False) + """The number of blocks to allocate for CPU memory.""" + + kv_sharing_fast_prefill: bool = False + """This feature is work in progress and no prefill optimization takes place + with this flag enabled currently. + + In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), + some layers can skip tokens corresponding to prefill. This flag enables + attention metadata for eligible layers to be overriden with metadata + necessary for implementating this optimization in some models (e.g. 
Gemma3n) + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.cache_dtype) + # `cpu_offload_gb` does not use `torch.compile` yet. + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + self.swap_space_bytes = self.swap_space * GiB_bytes + + self._verify_cache_dtype() + self._verify_prefix_caching() + + def metrics_info(self): + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info + return {key: str(value) for key, value in self.__dict__.items()} + + @model_validator(mode='after') + def _verify_args(self) -> Self: + if self.cpu_offload_gb < 0: + raise ValueError("CPU offload space must be non-negative" + f", but got {self.cpu_offload_gb}") + + if self.gpu_memory_utilization > 1.0: + raise ValueError( + "GPU memory utilization must be less than 1.0. Got " + f"{self.gpu_memory_utilization}.") + + if self.kv_sharing_fast_prefill: + logger.warning_once( + "--kv-sharing-fast-prefill is currently work in progress " + "and not functional yet (i.e. no prefill savings)") + + return self + + def _verify_cache_dtype(self) -> None: + if self.cache_dtype == "auto": + pass + elif self.cache_dtype in get_args(CacheDType): + logger.info( + "Using fp8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor.") + else: + raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") + + def _verify_prefix_caching(self) -> None: + if not self.enable_prefix_caching: + return + + if self.sliding_window is not None and not envs.VLLM_USE_V1: + raise NotImplementedError( + "Prefix caching is not supported with sliding window. " + "Run with --disable-sliding-window to use prefix caching.") + + if (self.enable_prefix_caching and self.prefix_caching_hash_algo + not in get_args(PrefixCachingHashAlgo)): + raise ValueError( + "Unknown prefix caching hash algorithm: " + f"{self.prefix_caching_hash_algo}. Must be one of " + f"{get_args(PrefixCachingHashAlgo)}.") + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + total_cpu_memory = get_cpu_memory() + # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel + # group are in the same node. However, the GPUs may span multiple nodes. + num_gpus_per_node = parallel_config.tensor_parallel_size + cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node + + msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " + f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " + "is allocated for the swap space.") + if cpu_memory_usage > 0.7 * total_cpu_memory: + raise ValueError("Too large swap space. " + msg) + elif cpu_memory_usage > 0.4 * total_cpu_memory: + logger.warning("Possibly too large swap space. 
%s", msg) From 0757551c96fa97a4f8c0f06519e5b296171a08f1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Aug 2025 22:51:36 +0800 Subject: [PATCH 141/932] [doc] add beijing meetup links (#22596) Signed-off-by: youkaichao --- README.md | 3 ++- docs/community/meetups.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5348405b72..a65d4803fa 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,15 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 +- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). -- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

Previous News +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index e8b3a9c9c8..36232e6ad9 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). 
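
A quick gloss on the CacheConfig extraction earlier in this series (the new vllm/config/cache.py shown before the two documentation patches above): the swap-space guard in verify_with_parallel_config is simple arithmetic — swap_space GiB per GPU multiplied by the tensor-parallel size, compared against total CPU memory, with a warning above 40% and a hard error above 70%. The sketch below restates only that arithmetic as a standalone helper; check_swap_budget is an illustrative name rather than vLLM API, and it carries over the diff's FIXME assumption that all GPUs of a tensor-parallel group sit on one node.

GiB = 1 << 30

def check_swap_budget(swap_space_gib: float, tp_size: int,
                      total_cpu_memory_bytes: int) -> str:
    # Mirrors the thresholds in CacheConfig.verify_with_parallel_config:
    # swap_space GiB is allocated per GPU, and all GPUs of a tensor-parallel
    # group are assumed to share one node (see the FIXME in the diff above).
    usage = swap_space_gib * GiB * tp_size
    if usage > 0.7 * total_cpu_memory_bytes:
        return "error: too large swap space"
    if usage > 0.4 * total_cpu_memory_bytes:
        return "warning: possibly too large swap space"
    return "ok"

# Default 4 GiB swap with TP=8 on a 64 GiB host uses 32 GiB (50%): warning only.
print(check_swap_budget(4, 8, 64 * GiB))

With the default 4 GiB swap and TP=8, a 64 GiB host sits at 50% and only warns; the same settings become a hard error once the budget crosses roughly 44.8 GiB (70% of 64 GiB).
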
From b81fe83b2cfa061cb0f9cd88da9c88f22529f284 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Aug 2025 23:13:47 +0800 Subject: [PATCH 142/932] [doc] add alibaba cloud as sponsor (#22597) Signed-off-by: youkaichao --- README.md | 1 + docs/community/sponsors.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index a65d4803fa..d9e3ca660f 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ Cash Donations: Compute Resources: +- Alibaba Cloud - AMD - Anyscale - AWS diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md index b8a1ddbe38..6ad3a66252 100644 --- a/docs/community/sponsors.md +++ b/docs/community/sponsors.md @@ -15,6 +15,7 @@ Cash Donations: Compute Resources: +- Alibaba Cloud - AMD - Anyscale - AWS From b76753f0b58a070f626549115d1414ec421e7e49 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 11 Aug 2025 00:00:36 +0800 Subject: [PATCH 143/932] [Bugfix][Kernel] Support partial rotary embedding for MRoPE triton kernel (#22593) Signed-off-by: Isotr0py --- tests/kernels/{ => core}/test_mrope.py | 20 +++++++++---- .../layers/rotary_embedding/mrope.py | 28 +++++++++++-------- 2 files changed, 30 insertions(+), 18 deletions(-) rename tests/kernels/{ => core}/test_mrope.py (92%) diff --git a/tests/kernels/test_mrope.py b/tests/kernels/core/test_mrope.py similarity index 92% rename from tests/kernels/test_mrope.py rename to tests/kernels/core/test_mrope.py index 5918b7a58b..3f2f330f6d 100644 --- a/tests/kernels/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -42,12 +42,13 @@ def unroll_model_tp_dict(model_tp_dict): model_tp_dict = { "Qwen/Qwen2-VL-7B-Instruct": [1, 2], "Qwen/Qwen2-VL-72B-Instruct": [1, 2], - "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2] + "Qwen/Qwen2.5-VL-72B-Instruct": [1, 2], + "zai-org/GLM-4.1V-9B-Thinking": [1, 2], } # https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317 dtype_atol_rtol_list = [ - [torch.bfloat16, 1e-5, 1.6e-2], + [torch.bfloat16, 1e-2, 1.6e-2], ] num_tokens_list = [11, 8192] @@ -73,10 +74,12 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): rope_theta = config.rope_theta max_position = config.max_position_embeddings + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, @@ -110,7 +113,10 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens): reason="Skipping CUDA/ROCm only tests.") @pytest.mark.parametrize( "model_name, tp_size", - unroll_model_tp_dict({"Qwen/Qwen2-VL-7B-Instruct": [1, 2]})) + unroll_model_tp_dict({ + "Qwen/Qwen2-VL-7B-Instruct": [1, 2], + "zai-org/GLM-4.1V-9B-Thinking": [1, 2] + })) @pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list) @pytest.mark.parametrize("num_tokens", [4]) def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, @@ -126,10 +132,12 @@ def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, is_neox_style = True rope_theta = config.rope_theta max_position = config.max_position_embeddings + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position=max_position, base=rope_theta, is_neox_style=is_neox_style, @@ -145,7 +153,7 @@ def 
test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol, # Create a wrapper that makes the in-place function appear functional def functional_forward_cuda(pos, q, k): """Wrapper that converts in-place operation to functional style - + CUDA Graph does not support in-place operations. This wrapper creates working copies of the input tensors and modifies them. diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index d3b71930b6..a091cfb743 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -25,6 +25,7 @@ def _triton_qwen2vl_mrope_forward( n_qh: tl.constexpr, n_kh: tl.constexpr, hd: tl.constexpr, + rd: tl.constexpr, pad_n_qh: tl.constexpr, pad_n_kh: tl.constexpr, pad_hd: tl.constexpr, @@ -51,19 +52,19 @@ def _triton_qwen2vl_mrope_forward( h_end = t_end + mrope_section_h # Updated stride calculation for half head_dim - half_hd = hd // 2 - t_cos = cos + pid * half_hd - h_cos = t_cos + num_tokens * half_hd - w_cos = h_cos + num_tokens * half_hd - t_sin = sin + pid * half_hd - h_sin = t_sin + num_tokens * half_hd - w_sin = h_sin + num_tokens * half_hd + half_rd = rd // 2 + t_cos = cos + pid * half_rd + h_cos = t_cos + num_tokens * half_rd + w_cos = h_cos + num_tokens * half_rd + t_sin = sin + pid * half_rd + h_sin = t_sin + num_tokens * half_rd + w_sin = h_sin + num_tokens * half_rd # Updated offsets for half head_dim cos_offsets = tl.arange(0, pad_hd // 2) t_mask = cos_offsets < t_end h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) - w_mask = (h_end <= cos_offsets) & (cos_offsets < half_hd) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) @@ -85,9 +86,9 @@ def _triton_qwen2vl_mrope_forward( first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange( 0, pad_hd // 2)[None, :] first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange( - 0, pad_hd // 2)[None, :] < hd // 2) + 0, pad_hd // 2)[None, :] < rd // 2) first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange( - 0, pad_hd // 2)[None, :] < hd // 2) + 0, pad_hd // 2)[None, :] < rd // 2) q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, @@ -97,8 +98,8 @@ def _triton_qwen2vl_mrope_forward( other=0).to(sin_row.dtype) # right half of the head - second_half_q_offsets = first_half_q_offsets + (hd // 2) - second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_half_q_offsets = first_half_q_offsets + (rd // 2) + second_half_k_offsets = first_half_k_offsets + (rd // 2) second_q_mask = first_q_mask second_k_mask = first_k_mask @@ -130,6 +131,7 @@ def triton_mrope( sin: torch.Tensor, mrope_section: list[int], head_size: int, + rotary_dim: int, ) -> tuple[torch.Tensor, torch.Tensor]: """Qwen2VL mrope kernel. 
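
The point of threading rotary_dim (rd) through the kernel in the hunk above, and through the call sites in the hunks below, is partial rotary embedding: only the first rotary_dim = head_dim * partial_rotary_factor channels of each head get rotated, so the cos/sin strides and the first/second-half offsets have to step by rd // 2 instead of hd // 2. The plain-PyTorch sketch below shows only that slicing for a NeoX-style rotation and deliberately ignores the t/h/w mrope sections the real kernel handles; apply_partial_rope_neox is a hypothetical reference helper, and the 0.5 factor is an assumption matching the kind of GLM-4.1V-style config the new test case exercises.

import torch

def apply_partial_rope_neox(x: torch.Tensor, cos: torch.Tensor,
                            sin: torch.Tensor, rotary_dim: int) -> torch.Tensor:
    # Rotate only the first rotary_dim channels of each head (NeoX style);
    # the remaining head_dim - rotary_dim channels pass through untouched.
    x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
    x1, x2 = x_rot.chunk(2, dim=-1)
    rotated = torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)
    return torch.cat((rotated, x_pass), dim=-1)

head_dim = 128
partial_rotary_factor = 0.5                          # assumed GLM-4.1V-style value
rotary_dim = int(head_dim * partial_rotary_factor)   # 64, as computed in the test
q = torch.randn(4, head_dim)
cos = torch.randn(4, rotary_dim // 2)
sin = torch.randn(4, rotary_dim // 2)
print(apply_partial_rope_neox(q, cos, sin, rotary_dim).shape)  # torch.Size([4, 128])
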
@@ -166,6 +168,7 @@ def triton_mrope( n_q_head, n_kv_head, head_size, + rotary_dim, pad_n_q_head, pad_n_kv_head, pad_hd, @@ -300,6 +303,7 @@ class MRotaryEmbedding(RotaryEmbedding): sin, self.mrope_section, self.head_size, + self.rotary_dim, ) return q.reshape(query_shape), k.reshape(key_shape) From 65a7917be480c1b0e45f12bfad31eb4b25539db9 Mon Sep 17 00:00:00 2001 From: Breno Baldas Skuk Date: Sun, 10 Aug 2025 18:03:15 +0200 Subject: [PATCH 144/932] Fix(benchmarks): allow multiple mm contents in OpenAI Chat Completion Benchmarks (#22534) Signed-off-by: breno.skuk --- benchmarks/backend_request_func.py | 17 ++++++++++++++--- benchmarks/benchmark_dataset.py | 2 +- benchmarks/benchmark_serving.py | 9 ++++++++- vllm/benchmarks/datasets.py | 4 +++- vllm/benchmarks/lib/endpoint_request_func.py | 18 +++++++++++++++--- vllm/benchmarks/serve.py | 9 ++++++++- 6 files changed, 49 insertions(+), 10 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index c7229dbb8e..1559ca2d92 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -31,7 +31,7 @@ class RequestFuncInput: model_name: Optional[str] = None logprobs: Optional[int] = None extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + multi_modal_content: Optional[dict | list[dict]] = None ignore_eos: bool = False language: Optional[str] = None @@ -364,7 +364,15 @@ async def async_request_openai_chat_completions( ) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] for openai-chat" + ) payload = { "model": request_func_input.model_name if request_func_input.model_name @@ -491,7 +499,10 @@ async def async_request_openai_audio( buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 1ad6cef7a9..ea684f18a7 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -52,7 +52,7 @@ class SampleRequest: prompt: Union[str, Any] prompt_len: int expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None lora_request: Optional[LoRARequest] = None diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 93b72211eb..ae38caf729 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -263,7 +263,14 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert test_mm_content is None or isinstance(test_mm_content, dict) + assert ( + test_mm_content is None + or isinstance(test_mm_content, dict) + or ( + isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content) + ) + 
), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 45b58035eb..4e8ac51625 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -71,7 +71,9 @@ class SampleRequest: prompt: Union[str, Any] prompt_len: int expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None + multi_modal_data: Optional[ + Union[MultiModalDataDict, dict, list[dict]] + ] = None lora_request: Optional[LoRARequest] = None diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 2d64cc115f..47bc288774 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -28,7 +28,7 @@ class RequestFuncInput: model_name: Optional[str] = None logprobs: Optional[int] = None extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + multi_modal_content: Optional[dict | list[dict]] = None ignore_eos: bool = False language: Optional[str] = None @@ -172,7 +172,16 @@ async def async_request_openai_chat_completions( content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] " + "for openai-chat" + ) payload = { "model": request_func_input.model_name @@ -310,7 +319,10 @@ async def async_request_openai_audio( buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 7cdf87cb4c..7bf04c7532 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -365,7 +365,14 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert test_mm_content is None or isinstance(test_mm_content, dict) + assert ( + test_mm_content is None + or isinstance(test_mm_content, dict) + or ( + isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content) + ) + ), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, From b4e2916721463b43f3b06ccc980050dfb37b615a Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 09:05:21 -0700 Subject: [PATCH 145/932] Migrate LlavaNextImageInputs to TensorSchema (#21774) Signed-off-by: Benji Beck Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/llava_next.py | 96 ++++++++---------------- vllm/utils/tensor_schema.py | 3 + 2 files changed, 35 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 04fb6b5736..a63c18493d 100644 --- a/vllm/model_executor/models/llava_next.py +++ 
b/vllm/model_executor/models/llava_next.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -11,7 +11,6 @@ import torch.nn as nn from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) -from typing_extensions import NotRequired from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -19,6 +18,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.multimodal.parse import ImageSize from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -30,32 +30,36 @@ from .utils import (AutoWeightsLoader, WeightsMapper, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) -class LlavaNextImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class LlavaNextImagePixelInputs(TensorSchema): """ - Shape: - `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - np: Number of patches + 1 + - c: Number of channels (3) + - h: Height + - w: Width + Note that `num_patches` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"})] - image_sizes: NotRequired[torch.Tensor] + image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)] + # This should be in `(height, width)` format. + + +class LlavaNextImageEmbeddingInputs(TensorSchema): """ - Shape: `(batch_size * num_images, 2)` - - This should be in `(height, width)` format. - """ - - -class LlavaNextImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size + - hs: Hidden size (must match language model backbone) """ + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")] LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, @@ -269,44 +273,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: - expected_dims = (2, ) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape) - - if actual_dims != expected_dims: - expected_expr = str(expected_dims) - raise ValueError( - f"The expected shape of image sizes per image per batch " - f"is {expected_expr}. 
You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - - def _validate_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("num_patches", *map(str, expected_dims)) - raise ValueError( - "The expected shape of pixel values per image per batch " - f"is {expected_expr}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaNextImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -325,13 +291,15 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") + expected_h = expected_w = self.config.vision_config.image_size return LlavaNextImagePixelInputs( type="pixel_values", - pixel_values=self._validate_pixel_values( - flatten_bn(pixel_values)), - image_sizes=self._validate_image_sizes( - flatten_bn(image_sizes, concat=True)), - ) + pixel_values=flatten_bn(pixel_values), + image_sizes=flatten_bn(image_sizes, concat=True), + resolve_bindings={ + "h": expected_h, + "w": expected_w, + }) if image_embeds is not None: if not isinstance(image_embeds, torch.Tensor): diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py index 343df71e10..4c3acf0094 100644 --- a/vllm/utils/tensor_schema.py +++ b/vllm/utils/tensor_schema.py @@ -60,6 +60,9 @@ class TensorSchema: def __getitem__(self, item) -> Any: return getattr(self, item) + def get(self, item, default=None) -> Any: + return getattr(self, item, default) + def _match_shape_with_dynamic(self, actual: tuple[int, ...], reference: tuple[int, ...], expected_shape: tuple[Union[int, str], ...], From 8c50d62f5a51799c2ecc1ad25380a5a6dd7c7180 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Mon, 11 Aug 2025 00:20:00 +0800 Subject: [PATCH 146/932] Remove redundant row_indices unsqueeze operation in MiniCPMO (#22528) Signed-off-by: zitian.zhao --- vllm/model_executor/models/minicpmo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 1ee0a94c37..e1746695bd 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -605,7 +605,6 @@ class MiniCPMO(MiniCPMV2_6): max=size) # Create column indices for broadcasting col_indices = torch.arange(size, device=device).unsqueeze(0) - row_indices = row_indices.unsqueeze(1) start_indices = start_indices.unsqueeze(1) end_indices = end_indices.unsqueeze(1) # Vectorized mask creation From 68b254d67300a1740db900a3d0ff4252424715d7 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 10:16:44 -0700 Subject: [PATCH 147/932] Fix TensorSchema validation test for symbolic dims (#22366) Signed-off-by: Benji Beck --- tests/standalone_tests/test_tensor_schema.py | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/standalone_tests/test_tensor_schema.py b/tests/standalone_tests/test_tensor_schema.py index e98aa3f53f..69744921b1 100644 --- a/tests/standalone_tests/test_tensor_schema.py +++ b/tests/standalone_tests/test_tensor_schema.py @@ -4,8 +4,8 @@ import pytest import torch -from vllm.model_executor.models.fuyu import 
FuyuImagePatchInputs from vllm.model_executor.models.glm4_1v import Glm4vImageEmbeddingInputs +from vllm.model_executor.models.granite_speech import GraniteSpeechAudioInputs from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs @@ -129,23 +129,27 @@ def test_tensor_schema_with_invalid_resolve_binding_dims(): def test_tensor_schema_with_list_of_symbolic_dim(): - flat_data = torch.stack([torch.randn(768) for _ in range(3)]) # (bn=3, fn) - patches_per_image = [64, 64, 64] # len = bn = 3 + input_features = torch.randn(3, 10, 160) # (b=3, fi=10, 160) + input_features_mask = torch.randn(3, 8) # (b=3, fo=8) + audio_embed_sizes = [8, 8, 8] # len = b = 3 - FuyuImagePatchInputs( - flat_data=flat_data, - patches_per_image=patches_per_image, + GraniteSpeechAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + audio_embed_sizes=audio_embed_sizes, ) def test_tensor_schema_with_list_of_symbolic_dim_mismatch_in_length(): - flat_data = torch.stack([torch.randn(768) for _ in range(4)]) # (bn=4, fn) - patches_per_image = [64, 64, 64] # len = 3 ≠ bn + input_features = torch.randn(4, 10, 160) # (b=4, fi=10, 160) + input_features_mask = torch.randn(4, 8) # (b=4, fo=8) + audio_embed_sizes = [8, 8, 8] # len = 3 ≠ b - with pytest.raises(ValueError, match="expected 'bn'=4, got 3"): - FuyuImagePatchInputs( - flat_data=flat_data, - patches_per_image=patches_per_image, + with pytest.raises(ValueError, match="expected 'b'=4, got 3"): + GraniteSpeechAudioInputs( + input_features=input_features, + input_features_mask=input_features_mask, + audio_embed_sizes=audio_embed_sizes, ) From d1af8b7be9c5ad9d2926ce215771e9cd7279147b Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Sun, 10 Aug 2025 19:29:02 -0400 Subject: [PATCH 148/932] enable Docker-aware precompiled wheel setup (#22106) Signed-off-by: dougbtv --- docker/Dockerfile | 15 ++-- setup.py | 185 +++++++++++++++++++++++++--------------------- vllm/envs.py | 11 ++- 3 files changed, 116 insertions(+), 95 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 04a63f5d68..85f55cac8d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 
setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi diff --git a/setup.py b/setup.py index e374fcb816..7f6c787129 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -281,10 +282,81 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class repackage_wheel(build_ext): +class precompiled_build_ext(build_ext): + """Disables extension building when using precompiled binaries.""" + + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + def build_extensions(self) -> None: + print("Skipping build_ext: using precompiled extensions.") + return + + +class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" - def get_base_commit_in_main_branch(self) -> str: + @staticmethod + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + import tempfile + import zipfile + + temp_dir = None + try: + if not os.path.isfile(wheel_url_or_path): + wheel_filename = wheel_url_or_path.split("/")[-1] + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + print(f"Downloading wheel from {wheel_url_or_path} " + f"to {wheel_path}") + from urllib.request import urlretrieve + urlretrieve(wheel_url_or_path, filename=wheel_path) + else: + wheel_path = wheel_url_or_path + print(f"Using existing wheel at {wheel_path}") + + package_data_patch = {} + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + ] + + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members = list( + filter(lambda x: x.filename in files_to_copy, + wheel.filelist)) + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"[extract] {file.filename}") + target_path = os.path.join(".", file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open( + target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + pkg = os.path.dirname(file.filename).replace("/", ".") + package_data_patch.setdefault(pkg, []).append( + os.path.basename(file.filename)) + + return package_data_patch + finally: + if temp_dir is not None: + print(f"Removing temporary directory {temp_dir}") + shutil.rmtree(temp_dir) + + @staticmethod + def get_base_commit_in_main_branch() -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -297,6 +369,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. 
+ if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -329,86 +405,6 @@ class repackage_wheel(build_ext): "wheel may not be compatible with your dev branch: %s", err) return "nightly" - def run(self) -> None: - assert _is_cuda( - ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is None: - base_commit = self.get_base_commit_in_main_branch() - wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - # Fallback to nightly wheel if latest commit wheel is unavailable, - # in this rare case, the nightly release CI hasn't finished on main. - if not is_url_available(wheel_location): - wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - import zipfile - - if os.path.isfile(wheel_location): - wheel_path = wheel_location - print(f"Using existing wheel={wheel_path}") - else: - # Download the wheel from a given URL, assume - # the filename is the last part of the URL - wheel_filename = wheel_location.split("/")[-1] - - import tempfile - - # create a temporary directory to store the wheel - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - - from urllib.request import urlretrieve - - try: - urlretrieve(wheel_location, filename=wheel_path) - except Exception as e: - from setuptools.errors import SetupError - - raise SetupError( - f"Failed to get vLLM wheel from {wheel_location}") from e - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet - ] - - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"Extracting and including {file.filename} " - "from existing wheel") - package_name = os.path.dirname(file.filename).replace("/", ".") - file_name = os.path.basename(file.filename) - - if package_name not in package_data: - package_data[package_name] = [] - - wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -639,6 +635,29 @@ package_data = { ] } +# If using precompiled, extract and patch package_data (in advance of setup) +if envs.VLLM_USE_PRECOMPILED: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + wheel_url = wheel_location + else: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + from urllib.request 
import urlopen + try: + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + except Exception as e: + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url) + for pkg, files in patch.items(): + package_data.setdefault(pkg, []).extend(files) + if _no_device(): ext_modules = [] @@ -647,7 +666,7 @@ if not ext_modules: else: cmdclass = { "build_ext": - repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext } setup( diff --git a/vllm/envs.py b/vllm/envs.py index f81f6dacd8..c26c7f215d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False CMAKE_BUILD_TYPE: Optional[str] = None @@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. 
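
The precompiled-wheel patch above ([PATCH 148/932]) also resolves the Dockerfile TODO it deletes: the old expression bool(os.environ.get("VLLM_USE_PRECOMPILED")) treats any non-empty string — including "0" — as true, while the new envs.py lambda only accepts "1" or "true" after stripping and lowercasing. Below is a minimal sketch contrasting the two behaviours; the VLLM_PRECOMPILED_WHEEL_LOCATION override that both versions share is left out for brevity, and the two function names are illustrative only, not vLLM's envs API.

def old_use_precompiled(env: dict) -> bool:
    # Pre-patch behaviour: any non-empty string is truthy, so "0" still enabled it.
    return bool(env.get("VLLM_USE_PRECOMPILED"))

def new_use_precompiled(env: dict) -> bool:
    # Post-patch behaviour: only "1" or "true" (case-insensitive, stripped) enable it.
    return env.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")

for value in (None, "", "0", "1", "true", "False"):
    env = {} if value is None else {"VLLM_USE_PRECOMPILED": value}
    print(repr(value), old_use_precompiled(env), new_use_precompiled(env))

Running the loop prints old=True, new=False for "0" and "False", which is exactly the truthiness trap the removed Dockerfile comment called out.
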
From a554991748584b00e3bbd2ab192cbcac3f630263 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 19:29:16 -0700 Subject: [PATCH 149/932] Migrate LlavaNextVideoPixelInputs to TensorSchema (#21843) Signed-off-by: Benji Beck --- .../model_executor/models/llava_next_video.py | 57 +++++++------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a96df0b6f5..abc519edad 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -3,7 +3,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, Union +from typing import Annotated, Literal, Optional, Union import torch import torch.nn as nn @@ -25,6 +25,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava @@ -35,17 +36,25 @@ from .utils import (AutoWeightsLoader, WeightsMapper, from .vision import get_vision_encoder_info -class LlavaNextVideoPixelInputs(TypedDict): - type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, list[torch.Tensor]] - """ - Shape: `(batch_size, num_frames, num_channels, height, width)` +class LlavaNextVideoPixelInputs(TensorSchema): + """ + Dimensions: + - bs: Batch size + - nv: Number of videos + - nf: Number of frames + - nc: Number of channels (3) + - h: Height of each frame + - w: Width of each frame Note that `num_frames` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. Note that it only supports one video input for one batch. """ + type: Literal["pixel_values_videos"] = "pixel_values_videos" + + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bs", "nv", "nf", 3, "h", "w")] class LlavaNextVideoProcessingInfo(BaseProcessingInfo): @@ -320,27 +329,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) - def _validate_video_pixel_values( - self, data: Union[torch.Tensor, list[torch.Tensor]] - ) -> Union[torch.Tensor, list[torch.Tensor]]: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape[2:]) - - if actual_dims != expected_dims: - expected_expr = ("num_frames", *map(str, expected_dims)) - raise ValueError( - "The expected shape of pixel values in each video frame " - f"is {expected_expr}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[LlavaNextVideoPixelInputs]: """ @@ -355,14 +343,13 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, if pixel_values_videos is None: return None - if not isinstance(pixel_values_videos, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel_values_videos. 
" - f"Got type: {type(pixel_values_videos)}") - - return LlavaNextVideoPixelInputs( - type="pixel_values_videos", - data=pixel_values_videos, - ) + expected_h = expected_w = self.config.vision_config.image_size + return LlavaNextVideoPixelInputs(type="pixel_values_videos", + data=pixel_values_videos, + resolve_bindings={ + "h": expected_h, + "w": expected_w, + }) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: From 06da44f0cbf84da771a2a1e336e06432a09875c8 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Sun, 10 Aug 2025 19:29:19 -0700 Subject: [PATCH 150/932] Migrate LlavaImageInputs to TensorSchema (#21770) Signed-off-by: Benji Beck --- vllm/model_executor/models/llava.py | 67 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index cfc6ffd99a..708ca98995 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union, cast) import torch @@ -33,6 +33,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -44,35 +45,46 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .vision import get_vision_encoder_info -class LlavaImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: torch.Tensor +class LlavaImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height + - w: Width + Note that `height` or `width` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")] -class PixtralHFImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class PixtralHFImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images, num_channels, height, width)` - + Dimensions: + - bn: Batch size * number of images + - c: Number of channels + - h: Height + - w: Width + Note that `height` or `width` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "c", "h", "w")] -class LlavaImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. 
+class LlavaImageEmbeddingInputs(TensorSchema): """ + Dimensions: + - bn: Batch size * number of images + - ifs: Image feature size + - hs: Hidden size (must match language model backbone) + """ + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")] LlavaImageInputs = Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs, @@ -547,19 +559,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -579,10 +578,14 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): pixel_values=flatten_bn(pixel_values), ) + expected_h = expected_w = self.config.vision_config.image_size return LlavaImagePixelInputs( type="pixel_values", - pixel_values=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + pixel_values=flatten_bn(pixel_values, concat=True), + resolve_bindings={ + "h": expected_h, + "w": expected_w + }, ) if image_embeds is not None: From b799f4b9ea8d15d62c4f4a97926b274561fd9492 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 10 Aug 2025 19:30:00 -0700 Subject: [PATCH 151/932] [CI/Build] Fix tensorizer test for load_format change (#22583) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 - tests/entrypoints/openai/test_tensorizer_entrypoint.py | 2 +- tests/tensorizer_loader/test_tensorizer.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 221888edb3..db7351edbb 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -426,7 +426,6 @@ steps: - label: Tensorizer Test # 11min mirror_hardwares: [amdexperimental] - soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader - tests/tensorizer_loader diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index 4bf3798503..058e96f203 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -44,7 +44,7 @@ def model_uri(tmp_dir): def tensorize_model_and_lora(tmp_dir, model_uri): tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir) - args = EngineArgs(model=MODEL_NAME, device="cuda") + args = EngineArgs(model=MODEL_NAME) tensorize_lora_adapter(LORA_PATH, tensorizer_config) tensorize_vllm_model(args, tensorizer_config) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b8d7892e57..0fb142a1b6 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -166,7 +166,7 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref): combined_output = out + err assert 
("ValueError: Model loader extra config " "is not supported for load " - "format LoadFormat.AUTO") in combined_output + "format auto") in combined_output finally: del model gc.collect() @@ -186,7 +186,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, combined_output = out + err assert ("ValueError: Model loader extra config is not supported " - "for load format LoadFormat.SAFETENSORS") in combined_output + "for load format safetensors") in combined_output finally: del model gc.collect() From 5898b135abc7b7c0ef7107d21a07d54a84314b7c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 10 Aug 2025 19:33:48 -0700 Subject: [PATCH 152/932] [BugFix] Fix KVConnectorOutput TPU breakage (#22598) Signed-off-by: Nick Hill --- tests/v1/kv_connector/unit/utils.py | 12 ++++++++---- vllm/v1/core/sched/scheduler.py | 4 ++-- vllm/v1/worker/tpu_model_runner.py | 13 +++++++++---- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 291c84d117..c22d5b861e 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -179,6 +179,13 @@ def create_model_runner_output( sampled_token = EOS_TOKEN_ID if use_eos else 0 sampled_token_ids = [[sampled_token] for _ in req_ids] + kv_connector_output = None if ( + finished_sending is None + and finished_recving is None) else KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + # Make output data structure. return ModelRunnerOutput( req_ids=req_ids, @@ -188,10 +195,7 @@ def create_model_runner_output( logprobs=None, prompt_logprobs_dict={}, pooler_output=None, - kv_connector_output=KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving, - ), + kv_connector_output=kv_connector_output, ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 85fc1a4a01..dcb9f4dd36 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1151,8 +1151,8 @@ class Scheduler(SchedulerInterface): scheduler the request during the next step. """ - assert self.connector is not None - self.connector.update_connector_output(kv_connector_output) + if self.connector is not None: + self.connector.update_connector_output(kv_connector_output) # KV Connector:: update recv and send status from last step. 
for req_id in (kv_connector_output.finished_recving or ()): diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 915869726f..ae0219458e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1138,6 +1138,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): i, target_slice] = valid_sampled_token_ids[i] req_state.output_token_ids.extend(valid_sampled_token_ids[i]) + kv_connector_output = None if ( + finished_sending is None + and finished_recving is None) else KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, @@ -1146,10 +1153,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, pooler_output=[], - kv_connector_output=KVConnectorOutput( - finished_sending=finished_sending, - finished_recving=finished_recving, - )) + kv_connector_output=kv_connector_output, + ) # Check there are no new graphs compiled - all the graphs should be # captured and compiled during warm up. From 1b9902806915040ac9b3029f2ab7522ec505afc3 Mon Sep 17 00:00:00 2001 From: Lifans Date: Sun, 10 Aug 2025 19:49:51 -0700 Subject: [PATCH 153/932] [Misc][gpt-oss] Add rules to label gpt-oss related PRs (#22600) Signed-off-by: Lifan Shen --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index d8ae509e0a..495d207d44 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -118,6 +118,20 @@ pull_request_rules: add: - qwen +- name: label-gpt-oss + description: Automatically apply gpt-oss label + conditions: + - or: + - files~=^examples/.*gpt[-_]?oss.*\.py + - files~=^tests/.*gpt[-_]?oss.*\.py + - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py + - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py + - title~=(?i)gpt[-_]?oss + actions: + label: + add: + - gpt-oss + - name: label-rocm description: Automatically apply rocm label conditions: From afa5b7ca0b417abadfa85e32f28969b72e58a885 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Sun, 10 Aug 2025 21:29:35 -0700 Subject: [PATCH 154/932] [Misc][gpt-oss] guard import when triton kernel when not up to date (#22584) Signed-off-by: zhewenli --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 6d6a2e22bc..6b5284dc6c 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -5,15 +5,24 @@ from typing import TYPE_CHECKING, Any, Optional import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) from vllm.model_executor.layers.fused_moe.utils import extract_required_args from vllm.utils import has_triton_kernels +logger = init_logger(__name__) + if has_triton_kernels(): - import triton_kernels.swiglu - from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs - from triton_kernels.routing import routing + try: + import triton_kernels.swiglu + from 
triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, + matmul_ogs) + from triton_kernels.routing import routing + except ModuleNotFoundError: + logger.error( + "Failed to import Triton kernels. Please make sure your triton " + "version is compatible.") if TYPE_CHECKING: from triton_kernels.matmul_ogs import PrecisionConfig From f919d4cb8faac8c869ab87ee705dbd340fae4679 Mon Sep 17 00:00:00 2001 From: Eugene Cheah Date: Sun, 10 Aug 2025 22:52:31 -0700 Subject: [PATCH 155/932] [BugFix] Fix logits repetition penalty cuda check (#22592) --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 92de394180..70605d3c5f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -311,7 +311,7 @@ def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, output_mask: A boolean tensor indicating which tokens appear in the output. repetition_penalties: The repetition penalties of shape (num_seqs, ). """ - if current_platform.is_cuda() and logits.is_contiguous(): + if logits.is_cuda and logits.is_contiguous(): apply_repetition_penalties_cuda(logits, prompt_mask, output_mask, repetition_penalties) else: From 9c97a1c3496d7d8574dd0d2b3fffeae5cc2223ca Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 11 Aug 2025 13:52:34 +0800 Subject: [PATCH 156/932] [ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module. (#22521) Signed-off-by: vllmellm --- .../layers/rotary_embedding/base.py | 71 ++++++++++ .../layers/rotary_embedding/common.py | 4 +- .../rotary_embedding/deepseek_scaling_rope.py | 12 +- .../rotary_embedding/rocm_aiter_rope_ops.py | 127 ++++++++++++++++++ 4 files changed, 204 insertions(+), 10 deletions(-) create mode 100644 vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 10fce857a8..6dfc28be7d 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -8,6 +8,7 @@ import torch from vllm.model_executor.custom_op import CustomOp from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch +from .rocm_aiter_rope_ops import is_rocm_rotary_embedding_enabled @CustomOp.register("rotary_embedding") @@ -35,6 +36,7 @@ class RotaryEmbedding(CustomOp): cache = cache.to(dtype) self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) + self.is_rocm_aiter_enabled = is_rocm_rotary_embedding_enabled() def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" @@ -119,6 +121,75 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hip( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first=False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + # currently only rotary embedding ops from AITER package are + # supported for HiP forward. 
+ if self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets, + is_nope_first) + return self.forward_native(positions, query, key, offsets) + + def forward_hip_rocm_aiter( + self, + positions: torch.Tensor, + # if is_nope_first + # [[batch_size, seq_len, num_heads, nope_size+rope_size] + # if NOT is_nope_first + # [[batch_size, seq_len, num_heads, rope_size+nope_size], + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + is_nope_first: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + cos, sin = self.cos_sin_cache.chunk(2, dim=-1) + + cos = cos.unsqueeze(-2).unsqueeze(-2) + sin = sin.unsqueeze(-2).unsqueeze(-2) + + rotate_style = 0 if self.is_neox_style else 1 + + num_tokens = positions.numel() + + query_shape = query.shape + query = query.view(1, num_tokens, -1, self.head_size) + if key is not None: + key_shape = key.shape + key = key.view(1, num_tokens, -1, self.head_size) + + positions = positions.view(*query.shape[:2]) + if offsets is not None: + offsets = offsets.view(*query.shape[:2]) + + if not is_nope_first: + query_ = query[..., :self.rotary_dim] + key_ = key[..., :self.rotary_dim] if key is not None else None + else: + query_ = query[..., -self.rotary_dim:] + key_ = key[..., -self.rotary_dim:] if key is not None else None + + if key_ is None: + torch.ops.vllm.rocm_aiter_rotary_emb_without_key_forward_hip( + positions, sin, cos, query_, offsets, rotate_style, + is_nope_first) + return query.view(query_shape), None + + torch.ops.vllm.rocm_aiter_rotary_emb_with_key_forward_hip( + positions, sin, cos, query_, key_, offsets, rotate_style, + is_nope_first) + + return query.view(query_shape), key.view(key_shape) + def forward_xpu( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 8d821bea19..99b6bb2120 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -99,7 +99,7 @@ def yarn_linear_ramp_mask(low: float, high: float, dim: int, return ramp_func -def yarn_get_mscale(scale: float = 1) -> float: +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: if scale <= 1: return 1.0 - return 0.1 * math.log(scale) + 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index cd888b7334..5af671703a 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math from typing import Optional import torch @@ -10,13 +9,7 @@ from vllm.platforms import current_platform from .base import RotaryEmbedding from .common import (rotate_gptj, rotate_neox, yarn_find_correction_range, - yarn_linear_ramp_mask) - - -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 + yarn_get_mscale, yarn_linear_ramp_mask) class DeepseekScalingRotaryEmbedding(RotaryEmbedding): @@ -96,6 +89,9 @@ 
class DeepseekScalingRotaryEmbedding(RotaryEmbedding): offsets: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" + if self.is_rocm_aiter_enabled: + return self.forward_hip_rocm_aiter(positions, query, key, offsets) + assert key is not None query_rot = query[..., :self.rotary_dim] key_rot = key[..., :self.rotary_dim] diff --git a/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py new file mode 100644 index 0000000000..91a2318bad --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +import vllm.envs as envs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op + + +def is_rocm_rotary_embedding_enabled() -> bool: + return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER) + + +def rocm_aiter_rotary_emb_without_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_fwd_inplace( + query, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_fwd_inplace( + query, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_impl( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + import aiter as ops + if offsets is None: + ops.rope_cached_positions_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + else: + ops.rope_cached_positions_offsets_2c_fwd_inplace( + query, + key, + cos, + sin, + positions, + offsets, + rotate_style, + reuse_freqs_front_part=True, + nope_first=is_nope_first, + ) + + +def rocm_aiter_rotary_emb_with_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +def rocm_aiter_rotary_emb_without_key_forward_hip_fake( + positions: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + query: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + rotate_style: int = 0, + is_nope_first: bool = False, +) -> None: + pass + + +if is_rocm_rotary_embedding_enabled(): + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_with_key_forward_hip", + op_func=rocm_aiter_rotary_emb_with_key_forward_hip_impl, + mutates_args=["key", "query"], + fake_impl=rocm_aiter_rotary_emb_with_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rotary_emb_without_key_forward_hip", + op_func=rocm_aiter_rotary_emb_without_key_forward_hip_impl, + mutates_args=["query"], + 
fake_impl=rocm_aiter_rotary_emb_without_key_forward_hip_fake, + dispatch_key=current_platform.dispatch_key, + ) \ No newline at end of file From 39052dbca87616a549ab152713f1a3020b2f4eb8 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Mon, 11 Aug 2025 02:54:59 -0300 Subject: [PATCH 157/932] Support token_type_ids in V1 with less code changes (#21985) Signed-off-by: Max de Bayser --- tests/entrypoints/openai/test_rerank.py | 4 +- tests/entrypoints/openai/test_score.py | 4 +- tests/models/language/pooling/test_scoring.py | 9 ++ vllm/entrypoints/llm.py | 54 ++++++------ vllm/entrypoints/openai/serving_score.py | 82 +++++++---------- vllm/entrypoints/score_utils.py | 40 ++++++++- vllm/model_executor/models/bert.py | 88 +++++++++++++------ vllm/model_executor/models/roberta.py | 36 ++++---- vllm/pooling_params.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 40 +++++++++ 10 files changed, 235 insertions(+), 130 deletions(-) diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index f121693e32..73364294cb 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -126,7 +126,9 @@ def test_invocations(server: RemoteOpenAIServer): invocation_output["results"]): assert rerank_result.keys() == invocations_result.keys() assert rerank_result["relevance_score"] == pytest.approx( - invocations_result["relevance_score"], rel=0.01) + invocations_result["relevance_score"], rel=0.05) + # TODO: reset this tolerance to 0.01 once we find + # an alternative to flash_attn with bfloat16 @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 1a5df1d2db..cb6ec795ae 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -220,7 +220,9 @@ class TestModel: invocation_output["data"]): assert score_data.keys() == invocation_data.keys() assert score_data["score"] == pytest.approx( - invocation_data["score"], rel=0.01) + invocation_data["score"], rel=0.05) + # TODO: reset this tolerance to 0.01 once we find + # an alternative to flash_attn with bfloat16 def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index ef9d5530cd..6b5ff70681 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -23,6 +23,15 @@ TEXTS_2 = [ "The capital of Germany is Berlin.", ] + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + DTYPE = "half" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index ca24b0c32b..4014a961c6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -28,11 +28,15 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_mistral_chat_template, parse_chat_messages, resolve_chat_template_content_format) +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam, _cosine_similarity, _validate_score_input_lens, + compress_token_type_ids, get_score_prompt) +# yapf: enable from vllm.entrypoints.utils import (_validate_truncation_size, log_non_default_args) from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt @@ -1329,6 
+1333,7 @@ class LLM: model_config = self.llm_engine.model_config pooling_params.verify("score", model_config) + pooling_params_list = list[PoolingParams]() tokenization_kwargs: dict[str, Any] = {} @@ -1339,38 +1344,31 @@ class LLM: input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] - if model_config.is_multimodal_model: - for q, d in input_pairs: - _, engine_prompt = get_score_prompt( - model_config=model_config, - data_1=q, - data_2=d, - tokenizer=tokenizer, - tokenization_kwargs=tokenization_kwargs, - ) + model_config = self.llm_engine.model_config - parsed_prompts.append(engine_prompt) - else: - for q, t in input_pairs: - if model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. - prompt_inputs = tokenizer( - text=q, # type: ignore[arg-type] - text_pair=t, # type: ignore[arg-type] - **tokenization_kwargs) - else: - # `llm as reranker` models defaults to not using pad_token. - prompt_inputs = tokenizer( - text=q + t, # type: ignore[operator] - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) + for q, d in input_pairs: + _, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=q, + data_2=d, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + + if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( + "token_type_ids", None)): + params = pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + params.extra_kwargs = {"compressed_token_type_ids": compressed} + pooling_params_list.append(params) + else: + pooling_params_list.append(pooling_params) + + parsed_prompts.append(engine_prompt) self._validate_and_add_requests( prompts=parsed_prompts, - params=pooling_params, + params=pooling_params_list, use_tqdm=use_tqdm, lora_request=lora_request, ) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 4da2094147..c246274514 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -7,6 +7,7 @@ from typing import Any, Optional, Union from fastapi import Request +from vllm import envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger @@ -17,11 +18,15 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, ScoreResponseData, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam, _cosine_similarity, _validate_score_input_lens, + compress_token_type_ids, get_score_prompt) +# yapf: enable from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger @@ -158,6 +163,8 @@ class ServingScores(OpenAIServing): tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, ) + self._validate_input(request, engine_prompt["prompt_token_ids"], + full_prompt) if request.mm_processor_kwargs is not None: engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs @@ -188,64 +195,27 @@ class ServingScores(OpenAIServing): input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] - if self.model_config.is_multimodal_model: + preprocess_async = 
make_async(self._preprocess_score, + executor=self._tokenizer_executor) - preprocess_async = make_async(self._preprocess_score, - executor=self._tokenizer_executor) + preprocessed_prompts = await asyncio.gather( + *(preprocess_async(request=request, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + data_1=t1, + data_2=t2) for t1, t2 in input_pairs)) - preprocessed_prompts = await asyncio.gather( - *(preprocess_async(request=request, - tokenizer=tokenizer, - tokenization_kwargs=tokenization_kwargs, - data_1=t1, - data_2=t2) for t1, t2 in input_pairs)) - - for full_prompt, engine_prompt in preprocessed_prompts: - request_prompts.append(full_prompt) - engine_prompts.append(engine_prompt) - - else: - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - use_pad_token = self.model_config.use_pad_token - - if use_pad_token: - # cross_encoder models defaults to using pad_token. - tokenized_prompts = await asyncio.gather(*( - tokenize_async( - text=t1, # type: ignore[arg-type] - text_pair=t2, # type: ignore[arg-type] - **tokenization_kwargs) for t1, t2 in input_pairs)) - else: - # `llm as reranker` models defaults to not using pad_token. - tokenized_prompts = await asyncio.gather(*( - tokenize_async( - text=t1 + # type: ignore[operator] - t2, - **tokenization_kwargs) for t1, t2 in input_pairs)) - - for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs): - sep_token = tokenizer.sep_token if (tokenizer.sep_token - and use_pad_token) else '' - request_prompt = f"{t1}{sep_token}{t2}" - - input_ids = prompt_inputs["input_ids"] - text_token_prompt = \ - self._validate_input(request, input_ids, request_prompt) - engine_prompt = TokensPrompt( - prompt_token_ids=text_token_prompt["prompt_token_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) + for full_prompt, engine_prompt in preprocessed_prompts: + request_prompts.append(full_prompt) + engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] - pooling_params = request.to_pooling_params() + default_pooling_params = request.to_pooling_params() try: - pooling_params.verify("score", self.model_config) + default_pooling_params.verify("score", self.model_config) except ValueError as e: return self.create_error_response(str(e)) @@ -254,9 +224,19 @@ class ServingScores(OpenAIServing): self._log_inputs(request_id_item, request_prompts[i], - params=pooling_params, + params=default_pooling_params, lora_request=lora_request) + if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop( + "token_type_ids", None)): + pooling_params = default_pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + pooling_params.extra_kwargs = { + "compressed_token_type_ids": compressed + } + else: + pooling_params = (default_pooling_params) + generator = self.engine_client.encode( engine_prompt, pooling_params, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index f3f042355c..642d638953 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -184,15 +184,49 @@ def get_score_prompt( model_config, tokenizer, ) + from vllm.model_executor.model_loader import get_model_cls - full_prompt = apply_score_template(model_config, prompt_1, prompt_2) - - prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + model = get_model_cls(model_config) + if supports_score_template(model): + full_prompt = apply_score_template(model_config, prompt_1, prompt_2) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + elif model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer(text=prompt_1, + text_pair=prompt_2, + **tokenization_kwargs) + full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + else: + # `llm as reranker` models defaults to not using pad_token. + full_prompt = prompt_1 + prompt_2 + prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) + if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None: + engine_prompt["token_type_ids"] = token_type_ids + post_process_tokens(model_config, engine_prompt) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data return full_prompt, engine_prompt + + +def compress_token_type_ids(token_type_ids: list[int]) -> int: + """ + Return position of the first 1 or the length of the list + if not found. 
+ """ + first_one = len(token_type_ids) + err_msg = "Token type ids are expected to be a sequence"\ + " of zeros followed by a sequence of ones" + for i, type_id in enumerate(token_type_ids): + if type_id == 0 and first_one < i: + raise ValueError(err_msg) + elif type_id == 1 and first_one > i: + first_one = i + elif type_id > 1: + raise ValueError(err_msg) + + return first_one diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 8f988903f7..3d5d5d505b 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only +from .interfaces import SupportsCrossEncoding, SupportsQuant from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -60,21 +60,13 @@ class BertEmbedding(nn.Module): self, input_ids: torch.Tensor, position_ids: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - input_shape = input_ids.size() - # Input embeddings. + token_type_ids = _decode_token_type_ids(input_ids) + inputs_embeds = self.word_embeddings(input_ids) - - # Position embeddings. position_embeddings = self.position_embeddings(position_ids) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device) - token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings @@ -350,25 +342,23 @@ class BertModel(nn.Module, SupportsQuant): ) -> None: super().__init__() - config = vllm_config.model_config.hf_config - self.embeddings = embedding_class(config) + self.config = vllm_config.model_config.hf_config + self.embeddings = embedding_class(self.config) self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") def forward( self, input_ids: torch.Tensor, - position_ids: torch.Tensor, + positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids) + position_ids=positions) return self.encoder(hidden_states) def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): @@ -468,13 +458,11 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): self, input_ids: torch.Tensor, positions: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: return self.model(input_ids=input_ids, - position_ids=positions, - token_type_ids=token_type_ids, + positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) @@ -508,8 +496,53 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): }) -class BertForSequenceClassification(nn.Module, SupportsV0Only, - SupportsCrossEncoding, SupportsQuant): +# Here we encode the token type ids together with the input ids. +# Since we use int 32 for the input IDs and the vocabulary size +# is way lower than 2**31, there is room to encode additional +# bits. 
At the same time, for cross-encoder use cases, the +# token type ids are only 0 or 1, requiring only 1 bit. +# This means that we can store the token type ids in the 31st +# bit. We void the 32nd bit because that would produce a negative +# number, which could be used to signal other things. +# +# The reason for all of this is that all the tensors that are +# passed as input to the forward function of a module marked +# with @support_torch_compile have to be persistent. So to +# avoid adding more persistent tensors in the model runner, we +# encode more information in the same persistent tensor. +# +# Since the *ForClassification module is outside of the BertModel +# which is compiled, we can do the encoding here and then separate +# the information again in the Embedding layer. Since with bit masks +# we can do this entirely with torch operations and without branching, +# it works with torch compile. + +TOKEN_TYPE_SHIFT = 30 + + +def _encode_token_type_ids(input_ids: torch.Tensor, + token_type_ids: torch.Tensor) -> None: + # input_ids can be padded to the right + input_ids[:token_type_ids.shape[0]].bitwise_or_( + token_type_ids << TOKEN_TYPE_SHIFT) + + +def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: + + ids_mask = torch.ones(input_ids.shape, + dtype=torch.int32, + device=input_ids.device) << TOKEN_TYPE_SHIFT + tokens_mask = ids_mask.bitwise_not() + + token_type_ids = input_ids.bitwise_and(ids_mask) >> TOKEN_TYPE_SHIFT + + input_ids.bitwise_and_(tokens_mask) + + return token_type_ids + + +class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, + SupportsQuant): """A model that uses Bert to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -567,8 +600,13 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only, inputs_embeds: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: + + if token_type_ids is not None: + assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) + return self.bert(input_ids=input_ids, - position_ids=positions, + positions=positions, inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors, - token_type_ids=token_type_ids) + intermediate_tensors=intermediate_tensors) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 61c8faed40..005b917982 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -14,13 +14,16 @@ from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool, DispatchPooler, Pooler) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel +from vllm.model_executor.models.bert import (TOKEN_TYPE_SHIFT, + BertEmbeddingModel, BertModel, + _decode_token_type_ids, + _encode_token_type_ids) from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix) from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import SupportsCrossEncoding class RobertaEmbedding(nn.Module): @@ -53,17 +56,12 @@ class RobertaEmbedding(nn.Module): self, input_ids: torch.Tensor, position_ids: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - 
input_shape = input_ids.size() - inputs_embeds = self.word_embeddings(input_ids) - # Position embeddings. + token_type_ids = _decode_token_type_ids(input_ids) + + inputs_embeds = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings @@ -107,7 +105,6 @@ class RobertaEmbeddingModel(BertEmbeddingModel): self, input_ids: torch.Tensor, positions: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -119,9 +116,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel): position_ids=positions, padding_idx=self.padding_idx) - return self.model(input_ids, - positions, - token_type_ids=token_type_ids, + return self.model(input_ids=input_ids, + positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) @@ -153,8 +149,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): return loader.load_weights(weights_list, mapper=mapper) -class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, - SupportsV0Only): +class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -226,11 +221,14 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, replace_roberta_positions(input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx) + if token_type_ids is not None: + assert self.roberta.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) return self.roberta(input_ids=input_ids, - position_ids=positions, + positions=positions, inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors, - token_type_ids=token_type_ids) + intermediate_tensors=intermediate_tensors) # Adapted from transformers diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 7077f68353..29f037b437 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional import msgspec @@ -46,6 +46,9 @@ class PoolingParams( requires_token_ids: bool = False """Internal use only.""" + extra_kwargs: Optional[dict[str, Any]] = None + """Internal use only.""" + output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY @property @@ -167,7 +170,8 @@ class PoolingParams( f"softmax={self.softmax}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " - f"requires_token_ids={self.requires_token_ids})") + f"requires_token_ids={self.requires_token_ids}, " + f"extra_kwargs={self.extra_kwargs})") def __post_init__(self) -> None: assert self.output_kind == RequestOutputKind.FINAL_ONLY,\ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 48ff50fd6b..3cde7c6e96 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -336,6 +336,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): 
self.reorder_batch_threshold: Optional[int] = None + def _init_model_kwargs(self, num_tokens: int): + model_kwargs = dict[str, Any]() + num_reqs = self.input_batch.num_reqs + + pooling_params = self.input_batch.pooling_metadata.pooling_params + + num_pooling_reqs = len(pooling_params) + + if num_pooling_reqs == 0: + return model_kwargs + + assert num_pooling_reqs == num_reqs + + token_type_id_requests = dict[int, Any]() + for i, param in enumerate(pooling_params): + if param.extra_kwargs is not None and \ + (token_types := param.extra_kwargs.get( + "compressed_token_type_ids")) is not None: + token_type_id_requests[i] = token_types + + if len(token_type_id_requests) == 0: + return model_kwargs + + seq_lens = self.seq_lens[:num_reqs] + token_type_ids = [] + + for i in range(num_reqs): + pos = token_type_id_requests.get(i, seq_lens[i]) + ids = (torch.arange(seq_lens[i]) >= pos).int() + token_type_ids.append(ids) + + model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to( + device=self.device) + return model_kwargs + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ Update the order of requests in the batch based on the attention @@ -1504,12 +1539,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_ids = None inputs_embeds = self.inputs_embeds[:num_input_tokens] model_mm_kwargs = self._extract_mm_kwargs(scheduler_output) + model_kwargs = self._init_model_kwargs(num_scheduled_tokens) else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the # multimodal models, it is not desirable for performance since # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] + model_kwargs = self._init_model_kwargs(num_input_tokens) inputs_embeds = None model_mm_kwargs = {} if self.uses_mrope: @@ -1548,6 +1585,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_mm_kwargs, device=self.device, ), + **model_kwargs, ) if self.use_aux_hidden_state_outputs: @@ -2211,6 +2249,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): + model_kwargs = self._init_model_kwargs(num_tokens) if self.supports_mm_inputs: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] @@ -2252,6 +2291,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_mm_kwargs, device=self.device, ), + **model_kwargs, ) if self.use_aux_hidden_state_outputs: From 384a052971607f1561e734c87c9216f77f47e0fb Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 11 Aug 2025 15:13:27 +0800 Subject: [PATCH 158/932] [Misc] benchmark_moe supports expert parallel (#22251) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 72250e2fb6..13bf1be836 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,10 +22,10 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() -def ensure_divisibility(numerator, denominator): +def ensure_divisibility(numerator, denominator, text): """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, ( - "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) + assert 
numerator % denominator == 0, "{} {} is not divisible by tp {}.".format( + text, numerator, denominator ) @@ -577,12 +577,10 @@ def main(args: argparse.Namespace): E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] == "JambaForCausalLM": E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ( "DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM", @@ -591,17 +589,14 @@ def main(args: argparse.Namespace): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): E = config.num_experts topk = config.moe_topk[0] intermediate_size = config.moe_intermediate_size[0] - shard_intermediate_size = 2 * intermediate_size // args.tp_size else: # Support for llama4 config = config.get_text_config() @@ -609,8 +604,14 @@ def main(args: argparse.Namespace): E = config.num_local_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + enable_ep = bool(args.enable_expert_parallel) + if enable_ep: + ensure_divisibility(E, args.tp_size, "Number of experts") + E = E // args.tp_size + shard_intermediate_size = 2 * intermediate_size + else: + ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - ensure_divisibility(intermediate_size, args.tp_size) hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" @@ -742,6 +743,7 @@ if __name__ == "__main__": parser.add_argument( "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 ) + parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument( "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) From 1e55dfa7e552e0995630a2563aeae443945e2e81 Mon Sep 17 00:00:00 2001 From: JartX Date: Mon, 11 Aug 2025 09:13:30 +0200 Subject: [PATCH 159/932] [BUGFIX] KeyError 'layers.14.mlp.gate.g_idx' for Qwen3-MoE with GPTQ on ROCm (#22017) --- vllm/model_executor/models/qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 3d1e72299b..9b49952f37 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -149,7 +149,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): self.gate = ReplicatedLinear(config.hidden_size, config.num_experts, bias=False, - quant_config=None, + quant_config=quant_config, prefix=f"{prefix}.gate") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: From bc1d02ac85d834c98ec2794f1122b269f4c3e45b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:13:33 +0100 Subject: [PATCH 160/932] [Docs] Add comprehensive CLI reference for all 
large `vllm` subcommands (#22601) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 10 ++- docs/api/{summary.md => README.md} | 0 docs/cli/.meta.yml | 1 + docs/cli/.nav.yml | 8 +++ docs/cli/README.md | 87 +++++++++++++------------- docs/cli/bench/latency.md | 9 +++ docs/cli/bench/serve.md | 9 +++ docs/cli/bench/throughput.md | 9 +++ docs/cli/chat.md | 5 ++ docs/cli/complete.md | 5 ++ docs/cli/json_tip.inc.md | 9 +++ docs/cli/run-batch.md | 9 +++ docs/cli/serve.md | 9 +++ docs/configuration/engine_args.md | 10 +-- docs/mkdocs/hooks/generate_argparse.py | 49 ++++++++++----- requirements/docs.txt | 2 + vllm/benchmarks/throughput.py | 4 +- vllm/entrypoints/cli/openai.py | 70 ++++++++++++--------- vllm/entrypoints/openai/run_batch.py | 5 +- vllm/utils/__init__.py | 5 +- 20 files changed, 205 insertions(+), 110 deletions(-) rename docs/api/{summary.md => README.md} (100%) create mode 100644 docs/cli/.meta.yml create mode 100644 docs/cli/.nav.yml create mode 100644 docs/cli/bench/latency.md create mode 100644 docs/cli/bench/serve.md create mode 100644 docs/cli/bench/throughput.md create mode 100644 docs/cli/chat.md create mode 100644 docs/cli/complete.md create mode 100644 docs/cli/json_tip.inc.md create mode 100644 docs/cli/run-batch.md create mode 100644 docs/cli/serve.md diff --git a/docs/.nav.yml b/docs/.nav.yml index f57703c329..acedc32c30 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -11,7 +11,7 @@ nav: - Quick Links: - User Guide: usage/README.md - Developer Guide: contributing/README.md - - API Reference: api/summary.md + - API Reference: api/README.md - CLI Reference: cli/README.md - Timeline: - Roadmap: https://roadmap.vllm.ai @@ -58,11 +58,9 @@ nav: - CI: contributing/ci - Design Documents: design - API Reference: - - Summary: api/summary.md - - Contents: - - api/vllm/* - - CLI Reference: - - Summary: cli/README.md + - api/README.md + - api/vllm/* + - CLI Reference: cli - Community: - community/* - Blog: https://blog.vllm.ai diff --git a/docs/api/summary.md b/docs/api/README.md similarity index 100% rename from docs/api/summary.md rename to docs/api/README.md diff --git a/docs/cli/.meta.yml b/docs/cli/.meta.yml new file mode 100644 index 0000000000..0e1f7eccee --- /dev/null +++ b/docs/cli/.meta.yml @@ -0,0 +1 @@ +toc_depth: 3 \ No newline at end of file diff --git a/docs/cli/.nav.yml b/docs/cli/.nav.yml new file mode 100644 index 0000000000..6c2c09d566 --- /dev/null +++ b/docs/cli/.nav.yml @@ -0,0 +1,8 @@ +nav: + - README.md + - serve.md + - chat.md + - complete.md + - run-batch.md + - vllm bench: + - bench/*.md diff --git a/docs/cli/README.md b/docs/cli/README.md index b512a4f4ba..c708eb7958 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -1,7 +1,3 @@ ---- -toc_depth: 4 ---- - # vLLM CLI Guide The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: @@ -16,52 +12,48 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` -When passing JSON CLI arguments, the following sets of arguments are equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` - ## serve -Start the vLLM OpenAI Compatible API server. +Starts the vLLM OpenAI Compatible API server. -??? 
console "Examples" +Start with a model: - ```bash - # Start with a model - vllm serve meta-llama/Llama-2-7b-hf +```bash +vllm serve meta-llama/Llama-2-7b-hf +``` - # Specify the port - vllm serve meta-llama/Llama-2-7b-hf --port 8100 +Specify the port: - # Serve over a Unix domain socket - vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock +```bash +vllm serve meta-llama/Llama-2-7b-hf --port 8100 +``` - # Check with --help for more options - # To list all groups - vllm serve --help=listgroup +Serve over a Unix domain socket: - # To view a argument group - vllm serve --help=ModelConfig +```bash +vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock +``` - # To view a single argument - vllm serve --help=max-num-seqs +Check with --help for more options: - # To search by keyword - vllm serve --help=max +```bash +# To list all groups +vllm serve --help=listgroup - # To view full help with pager (less/more) - vllm serve --help=page - ``` +# To view a argument group +vllm serve --help=ModelConfig -### Options +# To view a single argument +vllm serve --help=max-num-seqs ---8<-- "docs/argparse/serve.md" +# To search by keyword +vllm serve --help=max + +# To view full help with pager (less/more) +vllm serve --help=page +``` + +See [vllm serve](./serve.md) for the full reference of all available arguments. ## chat @@ -78,6 +70,8 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm chat --quick "hi" ``` +See [vllm chat](./chat.md) for the full reference of all available arguments. + ## complete Generate text completions based on the given prompt via the running API server. @@ -93,7 +87,7 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1 vllm complete --quick "The future of AI is" ``` -
+See [vllm complete](./complete.md) for the full reference of all available arguments. ## bench @@ -120,6 +114,8 @@ vllm bench latency \ --load-format dummy ``` +See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments. + ### serve Benchmark the online serving throughput. @@ -134,6 +130,8 @@ vllm bench serve \ --num-prompts 5 ``` +See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments. + ### throughput Benchmark offline inference throughput. @@ -147,6 +145,8 @@ vllm bench throughput \ --load-format dummy ``` +See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments. + ## collect-env Start collecting environment information. @@ -159,24 +159,25 @@ vllm collect-env Run batch prompts and write results to file. -
-Examples +Running with a local file: ```bash -# Running with a local file vllm run-batch \ -i offline_inference/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct +``` -# Using remote file +Using remote file: + +```bash vllm run-batch \ -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ -o results.jsonl \ --model meta-llama/Meta-Llama-3-8B-Instruct ``` -
+See [vllm run-batch](./run-batch.md) for the full reference of all available arguments. ## More Help diff --git a/docs/cli/bench/latency.md b/docs/cli/bench/latency.md new file mode 100644 index 0000000000..21ab13e637 --- /dev/null +++ b/docs/cli/bench/latency.md @@ -0,0 +1,9 @@ +# vllm bench latency + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_latency.md" diff --git a/docs/cli/bench/serve.md b/docs/cli/bench/serve.md new file mode 100644 index 0000000000..f7c415c6be --- /dev/null +++ b/docs/cli/bench/serve.md @@ -0,0 +1,9 @@ +# vllm bench serve + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_serve.md" diff --git a/docs/cli/bench/throughput.md b/docs/cli/bench/throughput.md new file mode 100644 index 0000000000..e4ff5ce43c --- /dev/null +++ b/docs/cli/bench/throughput.md @@ -0,0 +1,9 @@ +# vllm bench throughput + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/bench_throughput.md" diff --git a/docs/cli/chat.md b/docs/cli/chat.md new file mode 100644 index 0000000000..b006cb8de6 --- /dev/null +++ b/docs/cli/chat.md @@ -0,0 +1,5 @@ +# vllm chat + +## Options + +--8<-- "docs/argparse/chat.md" diff --git a/docs/cli/complete.md b/docs/cli/complete.md new file mode 100644 index 0000000000..400359acf4 --- /dev/null +++ b/docs/cli/complete.md @@ -0,0 +1,5 @@ +# vllm complete + +## Options + +--8<-- "docs/argparse/complete.md" diff --git a/docs/cli/json_tip.inc.md b/docs/cli/json_tip.inc.md new file mode 100644 index 0000000000..c22430c264 --- /dev/null +++ b/docs/cli/json_tip.inc.md @@ -0,0 +1,9 @@ +When passing JSON CLI arguments, the following sets of arguments are equivalent: + +- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` +- `--json-arg.key1 value1 --json-arg.key2.key3 value2` + +Additionally, list elements can be passed individually using `+`: + +- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` +- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` \ No newline at end of file diff --git a/docs/cli/run-batch.md b/docs/cli/run-batch.md new file mode 100644 index 0000000000..f7d401b8da --- /dev/null +++ b/docs/cli/run-batch.md @@ -0,0 +1,9 @@ +# vllm run-batch + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/run-batch.md" diff --git a/docs/cli/serve.md b/docs/cli/serve.md new file mode 100644 index 0000000000..2c8f9d320f --- /dev/null +++ b/docs/cli/serve.md @@ -0,0 +1,9 @@ +# vllm serve + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Options + +--8<-- "docs/argparse/serve.md" diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md index e7ca08b557..05d4f76230 100644 --- a/docs/configuration/engine_args.md +++ b/docs/configuration/engine_args.md @@ -11,15 +11,7 @@ Engine arguments control the behavior of the vLLM engine. The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings. 
-When passing JSON CLI arguments, the following sets of arguments are equivalent: - -- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'` -- `--json-arg.key1 value1 --json-arg.key2.key3 value2` - -Additionally, list elements can be passed individually using `+`: - -- `--json-arg '{"key4": ["value3", "value4", "value5"]}'` -- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'` +--8<-- "docs/cli/json_tip.inc.md" ## `EngineArgs` diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index b003b5fd6c..ed5d3b0092 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -15,8 +15,14 @@ sys.modules["aiohttp"] = MagicMock() sys.modules["blake3"] = MagicMock() sys.modules["vllm._C"] = MagicMock() +from vllm.benchmarks import latency # noqa: E402 +from vllm.benchmarks import serve # noqa: E402 +from vllm.benchmarks import throughput # noqa: E402 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402 +from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402 +from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402 +from vllm.entrypoints.openai import cli_args # noqa: E402 +from vllm.entrypoints.openai import run_batch # noqa: E402 from vllm.utils import FlexibleArgumentParser # noqa: E402 logger = logging.getLogger("mkdocs") @@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter): self._markdown_output.append( f"Possible choices: {metavar}\n\n") - self._markdown_output.append(f"{action.help}\n\n") + if action.help: + self._markdown_output.append(f"{action.help}\n\n") if (default := action.default) != SUPPRESS: self._markdown_output.append(f"Default: `{default}`\n\n") @@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter): return "".join(self._markdown_output) -def create_parser(cls, **kwargs) -> FlexibleArgumentParser: +def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser: """Create a parser for the given class with markdown formatting. Args: @@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser: Returns: FlexibleArgumentParser: A parser with markdown formatting for the class. 
""" - parser = FlexibleArgumentParser() + parser = FlexibleArgumentParser(add_json_tip=False) parser.formatter_class = MarkdownFormatter with patch("vllm.config.DeviceConfig.__post_init__"): - return cls.add_cli_args(parser, **kwargs) - - -def create_serve_parser() -> FlexibleArgumentParser: - """Create a parser for the serve command with markdown formatting.""" - parser = FlexibleArgumentParser() - parser.formatter_class = lambda prog: MarkdownFormatter( - prog, starting_heading_level=4) - return make_arg_parser(parser) + _parser = add_cli_args(parser, **kwargs) + # add_cli_args might be in-place so return parser if _parser is None + return _parser or parser def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): @@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): # Create parsers to document parsers = { - "engine_args": create_parser(EngineArgs), - "async_engine_args": create_parser(AsyncEngineArgs, - async_args_only=True), - "serve": create_serve_parser(), + "engine_args": + create_parser(EngineArgs.add_cli_args), + "async_engine_args": + create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True), + "serve": + create_parser(cli_args.make_arg_parser), + "chat": + create_parser(ChatCommand.add_cli_args), + "complete": + create_parser(CompleteCommand.add_cli_args), + "bench_latency": + create_parser(latency.add_cli_args), + "bench_throughput": + create_parser(throughput.add_cli_args), + "bench_serve": + create_parser(serve.add_cli_args), + "run-batch": + create_parser(run_batch.make_arg_parser), } # Generate documentation for each parser diff --git a/requirements/docs.txt b/requirements/docs.txt index c589093110..a24b9c7e92 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -29,3 +29,5 @@ setproctitle torch transformers zmq +uvloop +prometheus-client diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index bbd18ca3ae..fdf6548ada 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -24,8 +24,6 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput @@ -146,6 +144,8 @@ async def run_vllm_async( disable_detokenize: bool = False, ) -> float: from vllm import SamplingParams + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) async with build_async_engine_client_from_engine_args( engine_args, diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index e71f77ba80..7c01de94a3 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -130,28 +130,33 @@ class ChatCommand(CLISubcommand): conversation.append(response_message) # type: ignore print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - chat_parser = subparsers.add_parser( - "chat", - help="Generate chat completions via the running API server.", - description="Generate chat completions via the running API server.", - usage="vllm chat [options]") - _add_query_options(chat_parser) - chat_parser.add_argument( + @staticmethod + def add_cli_args(parser: 
FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the chat command.""" + _add_query_options(parser) + parser.add_argument( "--system-prompt", type=str, default=None, help=("The system prompt to be added to the chat template, " "used for models that support system prompts.")) - chat_parser.add_argument("-q", - "--quick", - type=str, - metavar="MESSAGE", - help=("Send a single prompt as MESSAGE " - "and print the response, then exit.")) - return chat_parser + parser.add_argument("-q", + "--quick", + type=str, + metavar="MESSAGE", + help=("Send a single prompt as MESSAGE " + "and print the response, then exit.")) + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server.", + description="Generate chat completions via the running API server.", + usage="vllm chat [options]") + return ChatCommand.add_cli_args(parser) class CompleteCommand(CLISubcommand): @@ -179,25 +184,30 @@ class CompleteCommand(CLISubcommand): output = completion.choices[0].text print(output) - def subparser_init( - self, - subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - complete_parser = subparsers.add_parser( - "complete", - help=("Generate text completions based on the given prompt " - "via the running API server."), - description=("Generate text completions based on the given prompt " - "via the running API server."), - usage="vllm complete [options]") - _add_query_options(complete_parser) - complete_parser.add_argument( + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the complete command.""" + _add_query_options(parser) + parser.add_argument( "-q", "--quick", type=str, metavar="PROMPT", help= "Send a single prompt and print the completion output, then exit.") - return complete_parser + return parser + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "complete", + help=("Generate text completions based on the given prompt " + "via the running API server."), + description=("Generate text completions based on the given prompt " + "via the running API server."), + usage="vllm complete [options]") + return CompleteCommand.add_cli_args(parser) def cmd_init() -> list[CLISubcommand]: diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a10d57456b..01551a8c7f 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger # yapf: disable -from vllm.entrypoints.openai.api_server import build_async_engine_client from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchRequestOutput, BatchResponseData, @@ -34,7 +33,6 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_score import ServingScores from vllm.logger import init_logger -from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -469,6 +467,9 @@ async def run_batch( async def main(args: Namespace): + from vllm.entrypoints.openai.api_server import build_async_engine_client + 
from vllm.usage.usage_lib import UsageContext + async with build_async_engine_client( args, usage_context=UsageContext.OPENAI_BATCH_RUNNER, diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a4997226ea..095829db83 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1682,6 +1682,8 @@ class FlexibleArgumentParser(ArgumentParser): # Set the default "formatter_class" to SortedHelpFormatter if "formatter_class" not in kwargs: kwargs["formatter_class"] = SortedHelpFormatter + # Pop kwarg "add_json_tip" to control whether to add the JSON tip + self.add_json_tip = kwargs.pop("add_json_tip", True) super().__init__(*args, **kwargs) if sys.version_info < (3, 13): @@ -1726,7 +1728,8 @@ class FlexibleArgumentParser(ArgumentParser): def format_help(self) -> str: # Add tip about JSON arguments to the epilog epilog = self.epilog or "" - if not epilog.startswith(FlexibleArgumentParser._json_tip): + if (self.add_json_tip + and not epilog.startswith(FlexibleArgumentParser._json_tip)): self.epilog = FlexibleArgumentParser._json_tip + epilog return super().format_help() From ebf7605b0dd58ff5d572d1918e52ca732025eee0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 11 Aug 2025 15:15:27 +0800 Subject: [PATCH 161/932] [Misc] Move tensor schema tests (#22612) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 7 ++++--- tests/utils_/__init__.py | 6 ++++++ tests/{standalone_tests => utils_}/test_tensor_schema.py | 0 tests/{ => utils_}/test_utils.py | 3 +-- tools/check_pickle_imports.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 tests/utils_/__init__.py rename tests/{standalone_tests => utils_}/test_tensor_schema.py (100%) rename tests/{ => utils_}/test_utils.py (99%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index db7351edbb..ebcf51981e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -57,9 +57,10 @@ steps: - vllm/ - tests/mq_llm_engine - tests/async_engine - - tests/test_inputs + - tests/test_inputs.py + - tests/test_outputs.py - tests/multimodal - - tests/test_utils + - tests/utils_ - tests/worker - tests/standalone_tests/lazy_imports.py commands: @@ -70,7 +71,7 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - - pytest -v -s test_utils.py # Utils + - pytest -v -s utils_ # Utils - pytest -v -s worker # Worker - label: Python-only Installation Test diff --git a/tests/utils_/__init__.py b/tests/utils_/__init__.py new file mode 100644 index 0000000000..e6b4c3f636 --- /dev/null +++ b/tests/utils_/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module is named `utils_` instead of `utils` to avoid obscuring +`tests/utils.py`. 
+""" diff --git a/tests/standalone_tests/test_tensor_schema.py b/tests/utils_/test_tensor_schema.py similarity index 100% rename from tests/standalone_tests/test_tensor_schema.py rename to tests/utils_/test_tensor_schema.py diff --git a/tests/test_utils.py b/tests/utils_/test_utils.py similarity index 99% rename from tests/test_utils.py rename to tests/utils_/test_utils.py index 53a34642e5..a2db1ae684 100644 --- a/tests/test_utils.py +++ b/tests/utils_/test_utils.py @@ -5,7 +5,6 @@ import asyncio import hashlib import json -import logging import pickle import socket from collections.abc import AsyncIterator @@ -29,7 +28,7 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, merge_async_iterators, sha256, split_host_port, split_zmq_path, supports_kw, swap_dict_values) -from .utils import create_new_process_for_each_test, error_on_warning +from ..utils import create_new_process_for_each_test, error_on_warning @pytest.mark.asyncio diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index 5e99dc63eb..444e2bf53f 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -32,7 +32,7 @@ ALLOWED_FILES = set([ 'vllm/multimodal/hasher.py', 'vllm/transformers_utils/config.py', 'vllm/model_executor/models/registry.py', - 'tests/test_utils.py', + 'tests/utils_/test_utils.py', 'tests/tokenization/test_cached_tokenizer.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', From 951b038298cae379d1321087a296882aae61fce7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 11 Aug 2025 18:49:32 +0800 Subject: [PATCH 162/932] [Misc] Move jsontree to utils (#22622) Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 2 +- vllm/model_executor/models/aya_vision.py | 2 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minimax_vl_01.py | 2 +- vllm/model_executor/models/tarsier.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 2 +- vllm/{ => utils}/jsontree.py | 0 8 files changed, 7 insertions(+), 7 deletions(-) rename vllm/{ => utils}/jsontree.py (100%) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 6331a70b46..dc32365083 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -8,10 +8,10 @@ import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import TypeVar -from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils import get_allowed_kwarg_only_overrides +from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: from vllm.config import ModelConfig diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b476a4f918..5cd74bbba4 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -16,7 +16,6 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( get_optimal_tiled_canvas) from vllm.config import VllmConfig -from vllm.jsontree import json_map_leaves from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs @@ -29,6 +28,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from 
vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 708ca98995..89d2817b57 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -16,7 +16,6 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -33,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 62a7d37ec9..8107c6e8a0 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -8,7 +8,6 @@ import torch.nn as nn from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -17,6 +16,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 70cf5e95a5..c8709d866b 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -18,7 +18,6 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from vllm.config import VllmConfig from vllm.inputs import InputProcessingContext -from vllm.jsontree import json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -34,6 +33,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.jsontree import json_map_leaves from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 262b22e554..6074a4d54f 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -7,9 +7,9 @@ from typing import TypeVar, Union import torch -from vllm.jsontree import json_map_leaves, json_reduce_leaves from vllm.logger import init_logger from vllm.utils import GiB_bytes, LRUCache +from vllm.utils.jsontree import json_map_leaves, json_reduce_leaves from .inputs import MultiModalKwargs, MultiModalKwargsItem, NestedTensors diff --git 
a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 18aae35c6f..6d4bcef320 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -13,8 +13,8 @@ from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, import numpy as np from typing_extensions import NotRequired, TypeAlias -from vllm.jsontree import JSONTree, json_map_leaves from vllm.utils import LazyLoader, full_groupby, is_list_of +from vllm.utils.jsontree import JSONTree, json_map_leaves if TYPE_CHECKING: import torch diff --git a/vllm/jsontree.py b/vllm/utils/jsontree.py similarity index 100% rename from vllm/jsontree.py rename to vllm/utils/jsontree.py From 14a5d903ab826b723a24a2d89631006394de76a1 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:09:24 +0300 Subject: [PATCH 163/932] [Model] NemotronH Support (#22349) Signed-off-by: Daniel Afrimi --- vllm/model_executor/models/nemotron_h.py | 26 +++++++++++++++---- vllm/transformers_utils/configs/nemotron_h.py | 4 +-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index eb62d5a53c..08315a1385 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -64,20 +64,32 @@ class NemotronHMLP(nn.Module): def __init__( self, config: NemotronHConfig, + layer_idx: int, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, prefix: str = "", ) -> None: super().__init__() + + hybrid_override_pattern = config.hybrid_override_pattern + mlp_index = hybrid_override_pattern[:layer_idx + 1].count("-") - 1 + if isinstance(config.intermediate_size, list): + if len(config.intermediate_size) == 1: + intermediate_size = config.intermediate_size[0] + else: + intermediate_size = config.intermediate_size[mlp_index] + else: + intermediate_size = config.intermediate_size + self.up_proj = ColumnParallelLinear( input_size=config.hidden_size, - output_size=config.intermediate_size, + output_size=intermediate_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.up_proj", ) self.down_proj = RowParallelLinear( - input_size=config.intermediate_size, + input_size=intermediate_size, output_size=config.hidden_size, bias=bias, quant_config=quant_config, @@ -110,6 +122,7 @@ class NemotronHMLPDecoderLayer(nn.Module): quant_config=quant_config, bias=config.mlp_bias, prefix=f"{prefix}.mixer", + layer_idx=layer_idx, ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -146,7 +159,7 @@ class NemotronHMambaDecoderLayer(nn.Module): hidden_size=config.hidden_size, ssm_state_size=config.ssm_state_size, conv_kernel_size=config.conv_kernel, - intermediate_size=config.expand * config.hidden_size, + intermediate_size=config.mamba_num_heads * config.mamba_head_dim, use_conv_bias=config.use_conv_bias, use_bias=config.use_bias, n_groups=config.n_groups, @@ -205,7 +218,10 @@ class NemotronHAttention(nn.Module): # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = config.hidden_size // self.total_num_heads + if hasattr(config, "head_dim") and config.head_dim is not None: + self.head_dim = config.head_dim + else: + self.head_dim = config.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -481,7 +497,7 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, """ parallel_config = vllm_config.parallel_config hf_config = vllm_config.model_config.hf_config - intermediate_size = hf_config.expand * hf_config.hidden_size + intermediate_size = hf_config.mamba_num_heads * hf_config.mamba_head_dim return MambaStateShapeCalculator.mamba2_state_shape( intermediate_size=intermediate_size, diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py index 457b3371e9..027f291154 100644 --- a/vllm/transformers_utils/configs/nemotron_h.py +++ b/vllm/transformers_utils/configs/nemotron_h.py @@ -151,7 +151,7 @@ class NemotronHConfig(PretrainedConfig): num_hidden_layers=52, hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", num_attention_heads=32, - attention_head_dim=128, + head_dim=128, num_key_value_heads=8, # nemo: num_query_groups mlp_hidden_act="relu2", attention_bias=False, @@ -194,7 +194,7 @@ class NemotronHConfig(PretrainedConfig): self.num_hidden_layers = num_hidden_layers self.hybrid_override_pattern = hybrid_override_pattern self.num_attention_heads = num_attention_heads - self.attention_head_dim = attention_head_dim + self.head_dim = head_dim self.sliding_window = sliding_window self.max_position_embeddings = max_position_embeddings self.attention_dropout = attention_dropout From 3fa5b258455772b522d0e0d764d7dad65578310a Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Mon, 11 Aug 2025 15:22:45 +0100 Subject: [PATCH 164/932] Document aarch64 CPU support works (#22646) Signed-off-by: Eric Curtin --- docs/usage/v1_guide.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index a9492c8502..12191d3490 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -59,12 +59,12 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the ### Hardware -| Hardware | Status | -|------------|------------------------------------| -| **NVIDIA** | 🚀 | -| **AMD** | 🟢 | -| **TPU** | 🟢 | -| **CPU** | 🟢 (x86) 🟡 (MacOS) | +| Hardware | Status | +|------------|-----------------------------------------------| +| **NVIDIA** | 🚀 | +| **AMD** | 🟢 | +| **TPU** | 🟢 | +| **CPU** | 🟢 (x86\_64/aarch64) 🟡 (MacOS) | !!! 
note From 8e13d9fe6d486f3bfa096e28d683601d72a5a1cc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 12 Aug 2025 00:22:25 +0800 Subject: [PATCH 165/932] [Misc] Further clean up some redundant config definitions (#22649) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/transformers_utils/config.py | 51 +++++++++++++-------- vllm/transformers_utils/configs/__init__.py | 6 +-- vllm/transformers_utils/configs/mllama.py | 31 ------------- vllm/transformers_utils/configs/nvlm_d.py | 31 ------------- 4 files changed, 34 insertions(+), 85 deletions(-) delete mode 100644 vllm/transformers_utils/configs/mllama.py delete mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b70164c8c..02ea0814dd 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -32,11 +32,10 @@ from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, EAGLEConfig, JAISConfig, KimiVLConfig, MedusaConfig, - MllamaConfig, MLPSpeculatorConfig, + MLPSpeculatorConfig, Nemotron_Nano_VL_Config, - NemotronConfig, NVLM_D_Config, - OvisConfig, RWConfig, - SpeculatorsConfig, + NemotronConfig, OvisConfig, + RWConfig, SpeculatorsConfig, Step3TextConfig, Step3VLConfig, UltravoxConfig) # yapf: enable @@ -68,10 +67,6 @@ def _get_hf_token() -> Optional[str]: return None -_CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { - "mllama": MllamaConfig -} - _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, "deepseek_vl_v2": DeepseekVLV2Config, @@ -85,18 +80,30 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "eagle": EAGLEConfig, "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, - "NVLM_D": NVLM_D_Config, "ovis": OvisConfig, "ultravox": UltravoxConfig, "step3_vl": Step3VLConfig, "step3_text": Step3TextConfig, - **_CONFIG_REGISTRY_OVERRIDE_HF } _CONFIG_ATTRS_MAPPING: dict[str, str] = { "llm_config": "text_config", } +_AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { + "internvl_chat": { + "has_no_defaults_at_init": True + }, + # transformers regards mllama as is_encoder_decoder=False + # vllm needs is_encoder_decoder=True to enable cross-attention + "mllama": { + "is_encoder_decoder": True + }, + "NVLM_D": { + "has_no_defaults_at_init": True + }, +} + class ConfigFormat(str, enum.Enum): AUTO = "auto" @@ -273,11 +280,12 @@ def thinker_uses_mrope(config: PretrainedConfig) -> bool: def is_encoder_decoder(config: PretrainedConfig) -> bool: """Detect if the model with this config is used as an encoder/decoder.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - return is_encoder_decoder(text_config) - return getattr(config, "is_encoder_decoder", False) + def _is_encoder_decoder(config: PretrainedConfig) -> bool: + return getattr(config, "is_encoder_decoder", False) + + return (_is_encoder_decoder(config) + or _is_encoder_decoder(config.get_text_config())) def is_interleaved(config: PretrainedConfig) -> bool: @@ -291,13 +299,21 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False +def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): + """ + Update kwargs for AutoConfig initialization based on model_type + """ + if model_type in _AUTO_CONFIG_KWARGS_OVERRIDES: + kwargs.update(_AUTO_CONFIG_KWARGS_OVERRIDES[model_type]) + return kwargs + + def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: 
"""Remap config attributes to match the expected names.""" for old_attr, new_attr in _CONFIG_ATTRS_MAPPING.items(): if hasattr(config, old_attr): if not hasattr(config, new_attr): config.update({new_attr: getattr(config, old_attr)}) - delattr(config, old_attr) logger.debug("Remapped config attribute '%s' to '%s'", old_attr, new_attr) return config @@ -408,15 +424,14 @@ def get_config( ) else: try: + kwargs = _maybe_update_auto_config_kwargs( + kwargs, model_type=model_type) config = AutoConfig.from_pretrained( model, trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, token=_get_hf_token(), - # some old custom model's config needs - # `has_no_defaults_at_init=True` to work. - has_no_defaults_at_init=trust_remote_code, **kwargs, ) except ValueError as e: diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 82d24bb16b..8339c55bcf 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -17,13 +17,11 @@ from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig -from vllm.transformers_utils.configs.mllama import MllamaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config -from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, @@ -34,18 +32,16 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", "DeepseekVLV2Config", + "EAGLEConfig", "RWConfig", "JAISConfig", "MedusaConfig", - "EAGLEConfig", - "MllamaConfig", "MLPSpeculatorConfig", "MoonViTConfig", "KimiVLConfig", "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", - "NVLM_D_Config", "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py deleted file mode 100644 index f0cd2d52a5..0000000000 --- a/vllm/transformers_utils/configs/mllama.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from transformers.models.mllama import configuration_mllama as mllama_hf_config - - -class MllamaTextConfig(mllama_hf_config.MllamaTextConfig): - ''' - Use this class to override is_encoder_decoder: - - transformers regards mllama as is_encoder_decoder=False - - vllm needs is_encoder_decoder=True to enable cross-attention - ''' - - def __init__( - self, - **kwargs, - ): - super().__init__(**kwargs) - self.is_encoder_decoder = True - - -class MllamaConfig(mllama_hf_config.MllamaConfig): - - def __init__( - self, - text_config=None, - **kwargs, - ): - if isinstance(text_config, dict): - text_config = MllamaTextConfig(**text_config) - super().__init__(text_config=text_config, **kwargs) diff --git a/vllm/transformers_utils/configs/nvlm_d.py 
b/vllm/transformers_utils/configs/nvlm_d.py deleted file mode 100644 index edfc506882..0000000000 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py -# -------------------------------------------------------- -# NVLM-D -# Copyright (c) 2024 NVIDIA -# Licensed under Apache 2.0 License [see LICENSE for details] -# -------------------------------------------------------- -from transformers import Qwen2Config -from transformers.configuration_utils import PretrainedConfig - - -class NVLM_D_Config(PretrainedConfig): - model_type = 'NVLM_D' - is_composition = True - - def __init__(self, vision_config=None, llm_config=None, **kwargs): - super().__init__(**kwargs) - - # Handle vision_config initialization - if vision_config is None: - vision_config = {} - - # Handle llm_config initialization - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = Qwen2Config(**llm_config) From f7dcce7a4aabb1445c2827ac5d978a9c5e18be30 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:39:08 -0400 Subject: [PATCH 166/932] [Feature] Add `VLLM_USE_DEEP_GEMM_E8M0` Env to Control E8M0 Scale (#21968) Signed-off-by: yewentao256 --- tests/kernels/moe/test_block_fp8.py | 5 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 6 +-- vllm/envs.py | 5 ++ .../layers/fused_moe/batched_deep_gemm_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 6 +-- .../layers/fused_moe/triton_deep_gemm_moe.py | 6 +-- .../model_executor/layers/quantization/fp8.py | 19 +++----- .../layers/quantization/utils/fp8_utils.py | 6 +-- vllm/utils/deep_gemm.py | 47 +++++++++++++++---- 9 files changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 7dc6282326..75b2e9f791 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used dg_available = has_deep_gemm() @@ -224,7 +224,8 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE") +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), + reason="Not E8M0 scale MOE") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 266f1161a6..9b064db973 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import 
(is_blackwell_deep_gemm_used, +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, is_deep_gemm_supported) from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -370,7 +370,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int]): @@ -427,7 +427,7 @@ USE_FP8_DISPATCH = [False] @pytest.mark.parametrize("world_dp_size", [(2, 1)]) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_used(), +@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], diff --git a/vllm/envs.py b/vllm/envs.py index c26c7f215d..931edcfa7f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -127,6 +127,7 @@ if TYPE_CHECKING: VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False + VLLM_USE_DEEP_GEMM_E8M0: bool = True VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False @@ -925,6 +926,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. + # E8M0 is faster on B200 but may reduce accuracy. + "VLLM_USE_DEEP_GEMM_E8M0": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # JIT all the required kernels before model execution so there is no # JIT'ing in the hot-path. 
However, this warmup increases the engine diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 3ccddb5299..c48a0137c3 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, - is_blackwell_deep_gemm_used) + is_blackwell_deep_gemm_e8m0_used) logger = init_logger(__name__) @@ -176,7 +176,7 @@ def silu_mul_fp8_quant_deep_gemm( eps, fp8_min, fp8_max, - is_blackwell_deep_gemm_used(), + is_blackwell_deep_gemm_e8m0_used(), BLOCK=group_size, NUM_STAGES=8, num_warps=1, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 86cc6e0e5d..ad094c37f9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1387,8 +1387,8 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. - should_use_deep_gemm = is_blackwell_deep_gemm_used() or _valid_deep_gemm( - hidden_states, w1, w2) + should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used( + ) or _valid_deep_gemm(hidden_states, w1, w2) if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): assert apply_router_weight_on_input is False assert is_act_and_mul, ( diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index c67f7e8083..9d0ff2e061 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -107,7 +107,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
- if self.allow_deep_gemm and (is_blackwell_deep_gemm_used() + if self.allow_deep_gemm and (is_blackwell_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( @@ -133,7 +133,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): extra_expert_args: Optional[dict[str, Any]]): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) - or is_blackwell_deep_gemm_used())) + or is_blackwell_deep_gemm_e8m0_used())) experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert assert experts is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8b6ed154bd..9577fa025b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -45,7 +45,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, + is_deep_gemm_supported) from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: @@ -415,10 +416,10 @@ class Fp8LinearMethod(LinearMethodBase): # Activations not quantized for marlin. del layer.input_scale - # On B200, DeepGemm only support E8M0 scale, which means we need to + # On B200, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. - if is_blackwell_deep_gemm_used(): + if is_blackwell_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -505,15 +506,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): elif not self.block_quant: logger.warning_once("Model is not block quantized. Not using " "DeepGemm kernels") - elif (current_platform.is_cuda() - and current_platform.is_device_capability(90)): + elif (is_deep_gemm_supported()): logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.") self.allow_deep_gemm = True - elif (current_platform.is_cuda() - and is_blackwell_deep_gemm_used()): - logger.info_once("Using DeepGemm SM100 kernels for " - "Fp8MoEMethod.") - self.allow_deep_gemm = True else: logger.warning_once( "DeepGemm not supported on the current platform.") @@ -725,7 +720,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm and not is_blackwell_deep_gemm_used(): + if self.allow_deep_gemm and not is_blackwell_deep_gemm_e8m0_used(): # Lazy import to avoid CUDA initialization problems. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ @@ -851,7 +846,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - if is_blackwell_deep_gemm_used(): + if is_blackwell_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. 
block_sz = tuple(layer.weight_block_size) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 68a061968a..2fb7ef29e4 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op, has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used logger = init_logger(__name__) @@ -394,10 +394,8 @@ def per_token_group_quant_fp8( tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor. """ - # TODO(wentao): refactor this - # use_ue8m0 should be a global flag that could be set by user if use_ue8m0 is None: - use_ue8m0 = is_blackwell_deep_gemm_used() + use_ue8m0 = is_blackwell_deep_gemm_e8m0_used() dtype = current_platform.fp8_dtype() if dtype is None else dtype assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 174287b44b..861d9c0c00 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -31,19 +31,37 @@ def is_deep_gemm_supported() -> bool: @functools.cache -def is_blackwell_deep_gemm_used() -> bool: - """Return ``True`` if vLLM is configured to use DeepGEMM on a - Blackwell-class GPU. +def is_blackwell_deep_gemm_e8m0_used() -> bool: + """Return ``True`` if vLLM is configured to use DeepGEMM " + "E8M0 scale on a Blackwell-class GPU. """ - if not (envs.VLLM_USE_DEEP_GEMM and has_deep_gemm()): + if not (envs.VLLM_USE_DEEP_GEMM): + logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0.") + return False + + if not has_deep_gemm(): + logger.debug_once("DeepGEMM E8M0 disabled: DeepGEMM backend missing.") + return False + + if not envs.VLLM_USE_DEEP_GEMM_E8M0: + logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.") return False _lazy_init() + if _fp8_gemm_nt_impl is None: + logger.debug_once( + "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False - return (current_platform.is_cuda() - and current_platform.is_device_capability(100)) + enabled = (current_platform.is_cuda() + and current_platform.has_device_capability(100)) + if enabled: + logger.debug_once("DeepGEMM E8M0 enabled on Blackwell GPU.") + else: + logger.debug_once( + "DeepGEMM E8M0 disabled: not running on Blackwell GPU.") + return enabled def _missing(*_: Any, **__: Any) -> NoReturn: @@ -109,21 +127,30 @@ def fp8_gemm_nt(*args, **kwargs): _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) - return _fp8_gemm_nt_impl(*args, **kwargs) + return _fp8_gemm_nt_impl( + *args, + disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) - return _grouped_impl(*args, **kwargs) + return _grouped_impl( + *args, + disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): _lazy_init() if _grouped_masked_impl is None: return _missing(*args, **kwargs) - return _grouped_masked_impl(*args, **kwargs) + return _grouped_masked_impl( + *args, + disable_ue8m0_cast=not 
is_blackwell_deep_gemm_e8m0_used(), + **kwargs) def _ceil_to_ue8m0(x: torch.Tensor): @@ -181,6 +208,6 @@ __all__ = [ "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", - "is_blackwell_deep_gemm_used", + "is_blackwell_deep_gemm_e8m0_used", "is_deep_gemm_supported", ] From 16fb668b61c8d21d1e86f0fa4aa876beb7647a8d Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Mon, 11 Aug 2025 09:40:55 -0700 Subject: [PATCH 167/932] fix: NIXL connector transfers partial block to pass full multi-modal context (#21074) Signed-off-by: GuanLuo --- .../kv_connector/unit/test_nixl_connector.py | 18 ++- .../unit/test_remote_decode_lifecycle.py | 23 ++-- .../unit/test_remote_prefill_lifecycle.py | 104 +++++++++++++++++- .../kv_connector/v1/nixl_connector.py | 26 ++--- 4 files changed, 130 insertions(+), 41 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c5ca7df836..c673983235 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -173,9 +173,9 @@ def test_prompt_less_than_block_size(): """ Test that we can handle case where prompt is < block. - In this case, the P worker will send empty remote_block_ids. - The D worker should not schedule an async read in this case, - since there is nothing to pull. + In this case, the P worker will still send remote_block_ids of the + partial block. The D worker should schedule an async read + in this case. """ vllm_config = create_vllm_config() scheduler = create_scheduler(vllm_config) @@ -184,22 +184,20 @@ def test_prompt_less_than_block_size(): BLOCK_SIZE = vllm_config.cache_config.block_size NUM_TOKENS = int(BLOCK_SIZE * 0.5) - # Request will have 0 remote blocks. + # Request will have 1 partial remote block. request = create_request(request_id=1, num_tokens=NUM_TOKENS, do_remote_prefill=True, - num_remote_blocks=0) + num_remote_blocks=1) scheduler.add_request(request) scheduler_output = scheduler.schedule() - # This request should not have to read async. + # This request will read async. kv_connector_metadata = scheduler_output.kv_connector_metadata assert kv_connector_metadata is not None assert isinstance(kv_connector_metadata, NixlConnectorMetadata) - assert len(kv_connector_metadata.reqs_to_recv) == 0 - - # This request should be scheduled regularly. - assert len(scheduler_output.scheduled_new_reqs) == 1 + assert len(kv_connector_metadata.reqs_to_recv) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 0 class FakeNixlConnectorWorker(NixlConnectorWorker): diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 76394a540a..1bddfef0f2 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -121,13 +121,18 @@ def test_short_prompt_lifecycle(): model_runner_output = create_model_runner_output(reqs=[request]) # (1c): update_from_output() - # Since tokens < block_size, there will be no kv xfer. - # So this should be cleaned up immediately. - _ = scheduler.update_from_output(scheduler_output, model_runner_output) + # Even though tokens < block_size, there will be kv xfer for partial block. 
+ eco = scheduler.update_from_output(scheduler_output, model_runner_output) + kv_transfer_params = eco[0].outputs[0].kv_transfer_params + + assert (len(kv_transfer_params["remote_block_ids"]) == 1) # Confirm we do not have any memory leaks after req lifecycle. - # We need one more call to schedule() to clear data for persistent batch. - _ = scheduler.schedule() + # We need to mark sending finish to clear data for persistent batch. + scheduler_output = scheduler.schedule() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.finished_sending = [request.request_id] + scheduler.update_from_output(scheduler_output, model_runner_output) assert_scheduler_empty(scheduler) @@ -169,16 +174,16 @@ def test_prefix_cache_lifecycle(): eco = scheduler.update_from_output(scheduler_output, model_runner_output) kv_transfer_params = eco[0].outputs[0].kv_transfer_params - # Ensure we send all block ids, even if there is a cache hit. + # Ensure we send all block ids, including the partial blocks, + # even if there is a cache hit. assert (len( - kv_transfer_params["remote_block_ids"]) == NUM_EXTERNAL_FULL_BLOCKS) + kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + + 1)) # STEP (2): Ensure it is freed. scheduler_output = scheduler.schedule() - scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) model_runner_output.kv_connector_output = KVConnectorOutput( finished_sending=[request_remote.request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) - _ = scheduler.schedule() assert_scheduler_empty(scheduler) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 3d52ea526d..87f7490698 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -362,7 +362,7 @@ def test_cannot_schedule_after_recv(): BLOCK_SIZE = vllm_config.cache_config.block_size # Prompt will use 2 blocks + 1 block after we schedule. NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) - NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, @@ -393,14 +393,24 @@ def test_cannot_schedule_after_recv(): assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 1 - # Step 4: try to schedule, not enough blocks. + # Step 4: try to schedule, remote request is put to running list + # because the transfer is completed. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[request_normal, request_remote]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 2 + assert len(scheduler.waiting) == 0 + + # Step 5: Remote request will be put back to waiting list + # because it needs new block to hold generated token. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_normal]) scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 1 - # Step 5: finish the request, free it. + # Step 6: finish the request, free it. 
scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_normal], use_eos=True) @@ -408,15 +418,99 @@ def test_cannot_schedule_after_recv(): assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 - # Step 6: now we can schedule (with 2 blocks computed). + # Step 7: now we can schedule (with 2 blocks computed), + # request is retrieved from preempted list. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote]) - assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + assert (scheduler_output.scheduled_cached_reqs.num_computed_tokens[0] == NUM_PROMPT_BLOCKS * BLOCK_SIZE) scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.running) == 1 assert len(scheduler.waiting) == 0 + # Step 8: free everything. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + _ = scheduler.schedule() + assert_scheduler_empty(scheduler) + + +def test_cannot_recv(): + """ + Test that we can handle no schedule KV block transfer due to not + enough remaining KV blocks. + """ + + # NOTE: the KVCacheManager will use 1 null block. + # So there are 5 total working blocks. + TOTAL_NUM_BLOCKS = 6 + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config, num_blocks=TOTAL_NUM_BLOCKS) + + # Prime the KVCache. + NUM_PROMPT_BLOCKS = 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + # Prompt will use 2 blocks + 1 block after we schedule. + NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + + request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_remote = create_request(request_id=2, + num_tokens=NUM_TOKENS_REMOTE, + do_remote_prefill=True) + + # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode). + scheduler.add_request(request_normal) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 2: 3 blocks are in use, + # need 3 new for remote blocks but only 2 are available. + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + # Should not have KV transfer in progress. + assert (request_remote.status != RequestStatus.WAITING_FOR_REMOTE_KVS) + + # Step 3: finish the request, free it. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 4: now we can initiate KV transfer (with 2 blocks computed). 
+ scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + assert (request_remote.status == RequestStatus.WAITING_FOR_REMOTE_KVS) + + # Step 5: finish recving (5 blocks in use) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[], finished_recving=[request_remote.request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 6: schedule remote request + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + # Step 7: free everything. scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote], diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index e7fc2b1181..a6eeb27853 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -29,7 +29,7 @@ from vllm.distributed.utils import divide from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform -from vllm.utils import make_zmq_path, make_zmq_socket, round_down +from vllm.utils import make_zmq_path, make_zmq_socket from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -275,10 +275,7 @@ class NixlConnectorScheduler: if params is not None and params.get("do_remote_prefill"): # Remote prefill: get all prompt blocks from remote. - assert num_computed_tokens % self.block_size == 0 - rounded_num_prompt_tokens = round_down( - len(request.prompt_token_ids), self.block_size) - count = max(rounded_num_prompt_tokens - num_computed_tokens, 0) + count = len(request.prompt_token_ids) - num_computed_tokens if count > 0: return count, True @@ -301,18 +298,16 @@ class NixlConnectorScheduler: # NOTE: when accelerator is not directly supported by Nixl, # prefilled blocks need to be saved to host memory before transfer. - # figure out full computed blocks to save + # save all blocks block_ids = blocks.get_block_ids()[0] - all_full = request.num_tokens % self.block_size == 0 - full_block_ids = (block_ids if all_full else block_ids[:-1]) # TODO: skip the blocks that are already in the host xfer buffer. # Currently, the host xfer buffer block is 1-to-1 mapped to device # kv blocks, so host blocks won't be flushed as long as its device # block is not overwritten; and it will be safe to skip saving them # to host xfer buffer. - if full_block_ids: + if block_ids: self._reqs_need_save[request.request_id] = \ - (request, full_block_ids) + (request, block_ids) elif params.get("do_remote_prefill"): if params.get("remote_block_ids"): if all(p in params for p in ("remote_engine_id", "remote_host", @@ -401,12 +396,9 @@ class NixlConnectorScheduler: or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): return False, None - # Get computed blocks. 
- all_full = request.num_computed_tokens % self.block_size == 0 - computed_block_ids = block_ids if all_full else block_ids[:-1] - - # If prompt < block_size, no xfer so free blocks immediately. - delay_free_blocks = len(computed_block_ids) > 0 + # TODO: check whether block_ids actually ever be 0. If not we could + # remove the conditional below + delay_free_blocks = len(block_ids) > 0 if delay_free_blocks: # Prefill request on remote. It will be read from D upon completion @@ -416,7 +408,7 @@ class NixlConnectorScheduler: return delay_free_blocks, dict( do_remote_prefill=True, do_remote_decode=False, - remote_block_ids=computed_block_ids, + remote_block_ids=block_ids, remote_engine_id=self.engine_id, remote_host=self.side_channel_host, remote_port=self.side_channel_port, From 84cf78acee1e75bfa163863b3674aeb3ba266844 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 00:41:37 +0800 Subject: [PATCH 168/932] [Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930) Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 6 + .../entrypoints/openai/test_classification.py | 15 +++ tests/models/language/pooling/mteb_utils.py | 12 +- .../pooling/test_auto_prefix_cache_support.py | 93 ++++++++++++++ tests/models/language/pooling/test_baai.py | 117 +++++++++--------- .../pooling/test_bge_reranker_v2_gemma.py | 8 +- .../language/pooling/test_cross_encoder.py | 12 +- tests/models/language/pooling/test_gte.py | 87 ++++++------- .../models/language/pooling/test_intfloat.py | 44 +++---- tests/models/language/pooling/test_jina.py | 14 ++- .../language/pooling/test_mxbai_rerank.py | 15 +-- tests/models/language/pooling/test_nomic.py | 27 ++-- .../language/pooling/test_qwen3_reranker.py | 15 +-- .../pooling/test_snowflake_arctic_embed.py | 67 +++++----- tests/models/utils.py | 18 +++ tests/test_config.py | 14 +++ vllm/config/__init__.py | 8 ++ vllm/engine/arg_utils.py | 9 +- vllm/entrypoints/llm.py | 4 + vllm/model_executor/layers/pooler.py | 38 ++---- vllm/model_executor/models/adapters.py | 4 +- vllm/model_executor/models/bert.py | 16 +-- vllm/model_executor/models/bert_with_rope.py | 4 +- vllm/model_executor/models/interfaces.py | 14 +++ vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/jamba.py | 4 +- vllm/model_executor/models/modernbert.py | 6 +- vllm/model_executor/models/qwen2_rm.py | 16 +-- vllm/model_executor/models/registry.py | 6 +- vllm/model_executor/models/roberta.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 13 +- 31 files changed, 452 insertions(+), 261 deletions(-) create mode 100644 tests/models/language/pooling/test_auto_prefix_cache_support.py diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py index abdce8935e..71e76abcb7 100644 --- a/tests/entrypoints/llm/test_classify.py +++ b/tests/entrypoints/llm/test_classify.py @@ -65,3 +65,9 @@ def test_pooling_params(llm: LLM): assert torch.allclose( softmax(wo_activation), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." 
+ + +def test_encode_api(llm: LLM): + err_msg = "pooling_task must be one of.+" + with pytest.raises(ValueError, match=err_msg): + llm.encode(prompts, use_tqdm=False) diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 886267c211..30078fe902 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -211,3 +211,18 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): assert torch.allclose( F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2 ), "w_activation should be close to activation(wo_activation)." + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_pooling(server: RemoteOpenAIServer, model_name: str): + # pooling api uses ALL pooling, which does not support chunked prefill. + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float" + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 77aaddb4f5..d024c76ddd 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -177,9 +177,12 @@ def mteb_test_embed_models(hf_runner, max_model_len=None, **vllm_extra_kwargs) as vllm_model: + model_config = vllm_model.llm.llm_engine.model_config + if model_info.architecture: - assert (model_info.architecture - in vllm_model.llm.llm_engine.model_config.architectures) + assert model_info.architecture in model_config.architectures + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) @@ -286,7 +289,12 @@ def mteb_test_rerank_models(hf_runner, **vllm_extra_kwargs) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config + + if model_info.architecture: + assert (model_info.architecture in model_config.architectures) assert model_config.hf_config.num_labels == 1 + assert (model_config._model_info.default_pooling_type == + model_info.default_pooling_type) vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model), tasks=MTEB_RERANK_TASKS, diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py new file mode 100644 index 0000000000..15e24c59d1 --- /dev/null +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModelForSequenceClassification + +from tests.models.language.pooling.embed_utils import ( + run_embedding_correctness_test) + + +@pytest.mark.parametrize( + "model", + ["jason9693/Qwen2.5-1.5B-apeach"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_classify_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + example_prompts = example_prompts * 2 + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.classify(example_prompts) + + with hf_runner(model, + dtype=dtype, + 
auto_cls=AutoModelForSequenceClassification) as hf_model: + hf_outputs = hf_model.classify(example_prompts) + + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output) + vllm_output = torch.tensor(vllm_output) + + assert torch.allclose(hf_output, vllm_output, + 1e-3 if dtype == "float" else 1e-2) + + +@pytest.mark.parametrize( + "model", + ["Qwen/Qwen3-Embedding-0.6B"], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_embed_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +): + example_prompts = [str(s).strip() for s in example_prompts] * 2 + + with vllm_runner( + model, + runner="pooling", + max_model_len=None, + enable_prefix_caching=True, + ) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert cache_config.enable_prefix_caching + vllm_outputs = vllm_model.embed(example_prompts) + + with hf_runner( + model, + is_sentence_transformer=True, + ) as hf_model: + run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/e5-small", + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False + "papluca/xlm-roberta-base-language-detection", + ]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str) -> None: + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + enable_prefix_caching=True) as vllm_model: + cache_config = vllm_model.llm.llm_engine.cache_config + assert not cache_config.enable_prefix_caching diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 64a8f25220..6fbe0e82d7 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -2,73 +2,78 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, LASTPoolingEmbedModelInfo, + RerankModelInfo) from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel - EmbedModelInfo("BAAI/bge-base-en", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("BAAI/bge-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-noinstruct", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-base-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-small-zh-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-en-v1.5", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("BAAI/bge-large-zh-v1.5", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en", + architecture="BertModel", + 
enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-noinstruct", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-base-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-small-zh-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-en-v1.5", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("BAAI/bge-large-zh-v1.5", + architecture="BertModel", + enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("BAAI/bge-m3", - architecture="XLMRobertaModel", - enable_test=True), + CLSPoolingEmbedModelInfo("BAAI/bge-m3", + architecture="XLMRobertaModel", + enable_test=True), ########## Qwen2Model - EmbedModelInfo("BAAI/bge-code-v1", - architecture="Qwen2Model", - dtype="float32", - enable_test=True), + LASTPoolingEmbedModelInfo("BAAI/bge-code-v1", + architecture="Qwen2Model", + dtype="float32", + enable_test=True), ] RERANK_MODELS = [ ########## XLMRobertaForSequenceClassification - RerankModelInfo("BAAI/bge-reranker-base", - architecture="XLMRobertaForSequenceClassification", - enable_test=True), - RerankModelInfo("BAAI/bge-reranker-large", - architecture="XLMRobertaForSequenceClassification", - enable_test=False), - RerankModelInfo("BAAI/bge-reranker-v2-m3", - architecture="XLMRobertaForSequenceClassification", - enable_test=False) + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-base", + architecture="XLMRobertaForSequenceClassification", + enable_test=True), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-large", + architecture="XLMRobertaForSequenceClassification", + enable_test=False), + CLSPoolingRerankModelInfo( + "BAAI/bge-reranker-v2-m3", + architecture="XLMRobertaForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index 7fa9485dbc..206524d7ca 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -8,12 +8,12 @@ import torch from tests.conftest import HfRunner -from .mteb_utils import (RerankModelInfo, VllmMtebEncoder, - mteb_test_rerank_models) +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("BAAI/bge-reranker-v2-gemma", - architecture="GemmaForSequenceClassification"), + LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma", + architecture="GemmaForSequenceClassification"), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." 
# noqa: E501 diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py index 9a33063d7b..8c1bc5779b 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo, + RerankModelInfo) +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", - architecture="BertForSequenceClassification"), - RerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", - architecture="Qwen3ForSequenceClassification") + CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", + architecture="BertForSequenceClassification"), + LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + architecture="Qwen3ForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 48a0cd64fe..5a5fdfbb21 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,57 +4,58 @@ from typing import Any import pytest -from ...utils import check_transformers_version -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, + LASTPoolingEmbedModelInfo, check_transformers_version) +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("thenlper/gte-large", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("thenlper/gte-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-large-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-base-zh", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("thenlper/gte-small-zh", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("thenlper/gte-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-large-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-base-zh", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("thenlper/gte-small-zh", + architecture="BertModel", + enable_test=False), ########### NewModel - EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", - architecture="GteNewModel", - enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", - architecture="GteNewModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", + architecture="GteNewModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", + architecture="GteNewModel", + enable_test=True), ########### 
Qwen2ForCausalLM - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=True), + LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + architecture="Qwen2ForCausalLM", + enable_test=True), ########## ModernBertModel - EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", - architecture="ModernBertModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + architecture="ModernBertModel", + enable_test=True), ########## Qwen3ForCausalLM - EmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=True), - EmbedModelInfo("Qwen/Qwen3-Embedding-4B", - architecture="Qwen3ForCausalLM", - dtype="float32", - enable_test=False), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=True), + LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-4B", + architecture="Qwen3ForCausalLM", + dtype="float32", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index d899aaada2..e48bdbe940 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -2,34 +2,34 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from ...utils import EmbedModelInfo +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ ########## BertModel - EmbedModelInfo("intfloat/e5-small", - architecture="BertModel", - enable_test=True), - EmbedModelInfo("intfloat/e5-base", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/e5-large", - architecture="BertModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-small", - architecture="BertModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-small", + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/e5-base", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/e5-large", + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-small", + architecture="BertModel", + enable_test=False), ########## XLMRobertaModel - EmbedModelInfo("intfloat/multilingual-e5-base", - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("intfloat/multilingual-e5-large", - architecture="XLMRobertaModel", - enable_test=False), - EmbedModelInfo("intfloat/multilingual-e5-large-instruct", - architecture="XLMRobertaModel", - enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False), + CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False), ] diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 59b634428c..37c5bdc97d 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -6,20 +6,22 @@ import pytest from vllm import PoolingParams -from ...utils import EmbedModelInfo, RerankModelInfo +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, 
RerankModelInfo) from .embed_utils import (check_embeddings_close, correctness_test_embed_models, matryoshka_fy) from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ - EmbedModelInfo("jinaai/jina-embeddings-v3", - architecture="XLMRobertaModel", - is_matryoshka=True) + CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3", + architecture="XLMRobertaModel", + is_matryoshka=True) ] RERANK_MODELS = [ - RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual", - architecture="XLMRobertaForSequenceClassification") + CLSPoolingRerankModelInfo( + "jinaai/jina-reranker-v2-base-multilingual", + architecture="XLMRobertaForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index e74c58744d..480bd5e456 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -7,15 +7,16 @@ import torch from tests.conftest import HfRunner -from .mteb_utils import RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=True), - RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", - architecture="Qwen2ForSequenceClassification", - enable_test=False) + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", + architecture="Qwen2ForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index e16ec239a3..2d05958e9b 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -3,22 +3,23 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("nomic-ai/nomic-embed-text-v1", - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/CodeRankEmbed", - architecture="NomicBertModel", - enable_test=False), - EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", - architecture="NomicBertModel", - enable_test=True) + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1", + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/CodeRankEmbed", + architecture="NomicBertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", + architecture="NomicBertModel", + enable_test=True) ] diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 68e96f3270..37f5566a33 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -8,15 +8,16 @@ import torch from tests.conftest import HfRunner from tests.utils import multi_gpu_test -from .mteb_utils import 
RerankModelInfo, mteb_test_rerank_models +from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo +from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ - RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", - architecture="Qwen3ForSequenceClassification", - enable_test=True), - RerankModelInfo("Qwen/Qwen3-Reranker-4B", - architecture="Qwen3ForSequenceClassification", - enable_test=False) + LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B", + architecture="Qwen3ForSequenceClassification", + enable_test=True), + LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B", + architecture="Qwen3ForSequenceClassification", + enable_test=False) ] diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index d6b5dbd083..585fa0e683 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -3,42 +3,43 @@ import pytest -from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo +from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models MODELS = [ - EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", - is_matryoshka=False, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", - is_matryoshka=False, - architecture="NomicBertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l", - is_matryoshka=False, - architecture="BertModel", - enable_test=False), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", - is_matryoshka=True, - architecture="BertModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", - is_matryoshka=True, - architecture="XLMRobertaModel", - enable_test=True), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", - is_matryoshka=True, - architecture="GteModel", - enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", + is_matryoshka=False, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", + is_matryoshka=False, + architecture="NomicBertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", + is_matryoshka=True, + architecture="XLMRobertaModel", + enable_test=True), + CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", + is_matryoshka=True, + architecture="GteModel", + enable_test=True), ] diff --git a/tests/models/utils.py b/tests/models/utils.py index 11ddf45c8e..84aeb927c5 100644 --- a/tests/models/utils.py +++ 
b/tests/models/utils.py @@ -345,16 +345,34 @@ class EmbedModelInfo(NamedTuple): matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingEmbedModelInfo(EmbedModelInfo): + default_pooling_type: str = "LAST" + + class RerankModelInfo(NamedTuple): name: str architecture: str = "" dtype: str = "auto" + default_pooling_type: str = "" enable_test: bool = True +class CLSPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "CLS" + + +class LASTPoolingRerankModelInfo(RerankModelInfo): + default_pooling_type: str = "LAST" + + def dummy_hf_overrides( hf_config: PretrainedConfig, *, diff --git a/tests/test_config.py b/tests/test_config.py index 19b1b74e42..957771a422 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -227,6 +227,20 @@ def test_get_pooling_config_from_args(): assert asdict(pooling_config) == asdict(override_pooler_config) +@pytest.mark.parametrize( + ("model_id", "default_pooling_type", "pooling_type"), + [ + ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"), # LLM + ("intfloat/e5-small", "CLS", "MEAN"), # BertModel + ("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"), # reward + ("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP") # step reward + ]) +def test_default_pooling_type(model_id, default_pooling_type, pooling_type): + model_config = ModelConfig(model_id) + assert model_config._model_info.default_pooling_type == default_pooling_type + assert model_config.pooler_config.pooling_type == pooling_type + + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_bert_tokenization_sentence_transformer_config(): diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 700d29f956..03ab034c62 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -871,6 +871,10 @@ class ModelConfig: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) + default_pooling_type = self._model_info.default_pooling_type + if pooler_config.pooling_type is None: + pooler_config.pooling_type = default_pooling_type + return pooler_config return None @@ -3844,6 +3848,10 @@ class VllmConfig: disable_chunked_prefill_reasons.append( "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") + elif not getattr(self.model_config.hf_config, "is_causal", True): + disable_chunked_prefill_reasons.append( + "Only models using causal attention supports chunked " + "prefill and prefix caching; disabling both.") if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4767201617..41a6da709b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1600,11 +1600,10 @@ class EngineArgs: else: pooling_type = model_config.pooler_config.pooling_type - - # TODO: when encoder models are supported we'll have to - # check for causal attention here. 
- incremental_prefill_supported = (pooling_type is not None and - pooling_type.lower() == "last") + is_causal = getattr(model_config.hf_config, "is_causal", True) + incremental_prefill_supported = (pooling_type is not None + and pooling_type.lower() == "last" + and is_causal) action = "Enabling" if \ incremental_prefill_supported else "Disabling" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4014a961c6..915f14a29b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1100,6 +1100,10 @@ class LLM: "Try passing `--runner pooling` to use the model as a " "pooling model.") + if pooling_task not in self.supported_tasks: + raise ValueError( + f"pooling_task must be one of {self.supported_tasks}.") + if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( prompts=cast(Optional[Union[str, list[str]]], prompts), diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 0f2e58eb9b..e2162e5cbf 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -44,15 +44,14 @@ class ResolvedPoolingConfig: task: PoolingTask @classmethod - def from_config_with_defaults( + def from_config( cls, task: PoolingTask, pooler_config: PoolerConfig, - pooling_type: PoolingType, ) -> "ResolvedPoolingConfig": + assert pooler_config.pooling_type is not None return cls(task=task, - pooling_type=PoolingType[pooler_config.pooling_type] - if pooler_config.pooling_type is not None else pooling_type) + pooling_type=PoolingType[pooler_config.pooling_type]) @dataclass(frozen=True) @@ -68,32 +67,20 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def for_encode( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.ALL, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( - task="encode", - pooler_config=pooler_config, - pooling_type=default_pooling_type, - ) - - if resolved_config.pooling_type == PoolingType.STEP: + def for_encode(pooler_config: PoolerConfig): + if pooler_config.pooling_type == "STEP": return StepPooler() + resolved_config = ResolvedPoolingConfig(task="encode", + pooling_type=PoolingType.ALL) + return SimplePooler.from_config(resolved_config) @staticmethod - def for_embed( - pooler_config: PoolerConfig, - *, - default_pooling_type: PoolingType = PoolingType.LAST, - ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + def for_embed(pooler_config: PoolerConfig): + resolved_config = ResolvedPoolingConfig.from_config( task="embed", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) return SimplePooler.from_config(resolved_config) @@ -102,13 +89,10 @@ class Pooler(nn.Module, ABC): def for_classify( pooler_config: PoolerConfig, classifier: Optional[ClassifierFn], - *, - default_pooling_type: PoolingType = PoolingType.LAST, ): - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + resolved_config = ResolvedPoolingConfig.from_config( task="classify", pooler_config=pooler_config, - pooling_type=default_pooling_type, ) pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 867de2c68b..1dbe70f84a 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -182,8 +182,8 @@ def as_seq_cls_model(cls: _T) -> _T: assert pooler_config is not None pooling_type_str = 
pooler_config.pooling_type - pooling_type = (PoolingType.LAST if pooling_type_str is None else - PoolingType[pooling_type_str]) + assert pooling_type_str is not None + pooling_type = PoolingType[pooling_type_str] self.pooler = DispatchPooler({ "encode": diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 3d5d5d505b..6638f06f98 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,7 +28,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces import (SupportsCrossEncoding, SupportsQuant, + default_pooling_type) from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -327,6 +328,7 @@ class BertOutput(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class BertModel(nn.Module, SupportsQuant): is_pooling_model = True @@ -401,6 +403,7 @@ class BertModel(nn.Module, SupportsQuant): return loaded_params +@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -431,6 +434,7 @@ class BertPoolingModel(BertModel): return loaded_params +@default_pooling_type("CLS") class BertEmbeddingModel(nn.Module, SupportsQuant): """A model that uses Bert to provide embedding functionalities. @@ -486,13 +490,8 @@ class BertEmbeddingModel(nn.Module, SupportsQuant): def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: return DispatchPooler({ - "encode": - Pooler.for_encode(pooler_config), - "embed": - Pooler.for_embed( - pooler_config, - default_pooling_type=PoolingType.CLS, - ), + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), }) @@ -541,6 +540,7 @@ def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor: return token_type_ids +@default_pooling_type("CLS") class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. 
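The pooling changes in this patch hinge on a small class decorator: each encoder-style model declares its default pooling type, the registry records it in _ModelInfo, and ModelConfig falls back to it when no pooling type is configured. A minimal standalone sketch of that pattern follows; the decorator and getter mirror the interfaces.py hunk later in this patch, while _ToyBertModel, _ToyCausalLM, and the __main__ check are hypothetical, added only for illustration.

# Standalone sketch of the default-pooling-type mechanism in this patch.
# Only default_pooling_type/get_default_pooling_type mirror the diff;
# the toy classes below are hypothetical stand-ins.
from typing import Union


def default_pooling_type(pooling_type: str) -> object:
    """Class decorator that stamps a default pooling type onto a model."""

    def func(model: object):
        model.default_pooling_type = pooling_type
        return model

    return func


def get_default_pooling_type(model: Union[type[object], object]) -> str:
    # Models that never opt in fall back to "LAST".
    return getattr(model, "default_pooling_type", "LAST")


@default_pooling_type("CLS")
class _ToyBertModel:  # hypothetical stand-in for an encoder model
    pass


class _ToyCausalLM:  # hypothetical stand-in for a decoder-only model
    pass


if __name__ == "__main__":
    assert get_default_pooling_type(_ToyBertModel) == "CLS"
    assert get_default_pooling_type(_ToyCausalLM) == "LAST"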
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 050f18f16e..e18b7b7ffa 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,7 +27,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsQuant +from vllm.model_executor.models.interfaces import (SupportsQuant, + default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -401,6 +402,7 @@ class BertWithRopeEncoder(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index b6d9877cd0..46caf3fce4 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -641,6 +641,20 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) +def default_pooling_type(pooling_type: str) -> object: + """Set default_pooling_type decorator. """ + + def func(model: object): + model.default_pooling_type = pooling_type + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") + + class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d29779a35e..d0c4bf5450 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -401,6 +401,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): return loaded_params +@default_pooling_type("ALL") class InternLM2ForRewardModel(InternLM2ForCausalLM): is_pooling_model = True diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index c1033aff07..fbd310121a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -22,8 +22,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.mamba.mamba_utils import ( MambaStateShapeCalculator) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -604,6 +603,5 @@ class 
JambaForSequenceClassification(JambaForCausalLM): Pooler.for_classify( pooler_config, classifier=self.score, - default_pooling_type=PoolingType.LAST, ), }) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 761fce815e..2c3bdd1c93 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,7 +26,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import (SupportsCrossEncoding, SupportsV0Only, + default_pooling_type) from .utils import WeightsMapper, maybe_prefix @@ -201,6 +202,7 @@ class ModernBertEncoderLayer(nn.Module): @support_torch_compile +@default_pooling_type("CLS") class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) @@ -264,7 +266,6 @@ class ModernBertPooler(Pooler): self.pooling = PoolingMethod.from_pooling_type(pooling_type) self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias) - self.pooling_type = config.classifier_pooling self.act = nn.GELU() self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, @@ -294,6 +295,7 @@ class ModernBertPooler(Pooler): return pooled_output +@default_pooling_type("CLS") class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, SupportsCrossEncoding): diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 9b6b70c75c..e0a30e04c6 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -15,11 +15,10 @@ from torch import nn from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, - PoolingType) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix @@ -90,6 +89,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): return loader.load_weights(weights) +@default_pooling_type("ALL") class Qwen2ForRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -103,6 +103,7 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel): {"encode": Pooler.for_encode(pooler_config)}, ) +@default_pooling_type("STEP") class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -112,10 +113,5 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.pooler = DispatchPooler({ - "encode": - Pooler.for_encode( - pooler_config, - default_pooling_type=PoolingType.STEP, - ) - }) + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index aca3d84f00..1b0c902c5e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,8 +25,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.dynamic_module import ( 
try_get_class_from_dynamic_module) -from .interfaces import (has_inner_state, has_noops, is_attention_free, - is_hybrid, supports_cross_encoding, +from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, + is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, supports_pp, supports_transcription, supports_v0_only) from .interfaces_base import is_pooling_model, is_text_generation_model @@ -305,6 +305,7 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool + default_pooling_type: str supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool @@ -323,6 +324,7 @@ class _ModelInfo: architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), + default_pooling_type=get_default_pooling_type(model), supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 005b917982..32a4a2c9a2 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -23,7 +23,7 @@ from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding +from .interfaces import SupportsCrossEncoding, default_pooling_type class RobertaEmbedding(nn.Module): @@ -86,6 +86,7 @@ class RobertaClassificationHead(nn.Module): return x +@default_pooling_type("CLS") class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -149,6 +150,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): return loader.load_weights(weights_list, mapper=mapper) +@default_pooling_type("CLS") class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3cde7c6e96..045a06d927 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1272,7 +1272,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if not is_pooling_model(model): return [] - return list(model.pooler.get_supported_tasks()) + supported_tasks = list(model.pooler.get_supported_tasks()) + + if (self.scheduler_config.chunked_prefill_enabled + and "encode" in supported_tasks): + supported_tasks.remove("encode") + + logger.info_once("Chunked prefill is not supported with " + "encode task which using ALL pooling. 
" + "Please turn off chunked prefill by " + "`--no-enable-chunked-prefill` before using it.") + + return supported_tasks def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() From c90fb03df566cd76b0e69f91158108909da80c51 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 12 Aug 2025 01:00:58 +0800 Subject: [PATCH 169/932] [CI/Build] Skip Mllama HF runner tests with Transformers v4.55.0 (#22659) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../models/multimodal/generation/test_mllama.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 2bb01e494d..b413c4d6b3 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -6,6 +6,7 @@ from typing import Optional, overload import pytest import torch from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer +from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM, SamplingParams from vllm.attention.backends.flash_attn import FlashAttentionMetadata @@ -285,6 +286,10 @@ def clear_cache(): @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, num_logprobs, @@ -313,6 +318,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, model, dtype, max_tokens, num_logprobs, attn_backend: _Backend) -> None: @@ -362,6 +371,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, dtype, max_tokens, num_logprobs, attn_backend: _Backend) -> None: @@ -402,6 +415,10 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.skipif( + TRANSFORMERS_VERSION == "4.55.0", + reason="Transformers v4.55.0 has a regression issue on mllama, " + "see: https://github.com/huggingface/transformers/pull/40083") def test_models_distributed( hf_runner, vllm_runner, From 807d21b80d11437f10dc3360ad8215f3ca6eb2e8 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:31:36 -0700 Subject: 
[PATCH 170/932] [BugFix] [Spec Decode] Remove LlamaForCausalLMEagle3 to fix CI (#22611) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/models/registry.py | 9 ++--- tests/v1/e2e/test_spec_decode.py | 45 +++++++++++++----------- vllm/model_executor/models/registry.py | 3 +- vllm/transformers_utils/configs/eagle.py | 2 +- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 898e38a4ae..c5816df25b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -525,10 +525,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), - "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 - trust_remote_code=True, - speculative_model="AngelSlim/Qwen3-8B_eagle3", - tokenizer="Qwen/Qwen3-8B"), + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + # trust_remote_code=True, + # speculative_model="AngelSlim/Qwen3-8B_eagle3", # noqa: E501 + # tokenizer="Qwen/Qwen3-8B"), "EagleLlama4ForCausalLM": _HfExamplesInfo( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", trust_remote_code=True, diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index cd383b58db..599916c0d1 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -125,27 +125,30 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize(["model_setup", "mm_enabled"], [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), - (("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), - (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - False, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - True, - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), -], - ids=[ - "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", - "llama4_eagle", "llama4_eagle_mm" - ]) +@pytest.mark.parametrize( + ["model_setup", "mm_enabled"], + [ + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + ], + ids=[ + "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle", + "llama4_eagle_mm" + ]) @pytest.mark.parametrize("attn_backend", 
get_attn_backend_list_based_on_platform()) def test_eagle_correctness( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1b0c902c5e..870704c64d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -259,7 +259,8 @@ _SPECULATIVE_DECODING_MODELS = { "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), - "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 01217eb191..bc249c5836 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -45,7 +45,7 @@ class EAGLEConfig(PretrainedConfig): # Eagle model name should follow naming convention of # LlamaForCausalLM -> EagleLlamaForCausalLM - # LlamaForCausalLM -> Eagle3LlamaForCausalLM / LlamaForCausalLMEagle3 + # LlamaForCausalLM -> Eagle3LlamaForCausalLM if method == "eagle": assert self.model is not None, \ "model should not be None when method is eagle" From 65abe111a3035d3bf70dce217ba4e1889aa20dc3 Mon Sep 17 00:00:00 2001 From: TJian Date: Mon, 11 Aug 2025 10:36:05 -0700 Subject: [PATCH 171/932] [CI] Skip Tree Attn Test in `test_max_len.py` to unblock CI (#22664) Signed-off-by: tjtanaa --- tests/v1/spec_decode/test_max_len.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index fef6a5421b..01019b29e0 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -40,6 +40,11 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + if attn_backend == "TREE_ATTN" and num_speculative_tokens > 1: + # TREE_ATTN fails the test with multi-token spec decode + # TODO: Investigate why + pytest.skip("TREE_ATTN fails the test") + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" From 458e74eb907f96069e6d8a4f3c9f457001fef2ea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 11 Aug 2025 18:42:48 +0100 Subject: [PATCH 172/932] Support more parallel styles in Transformers backend TP (#22651) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index fc4585618b..25b8b69e08 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -107,10 +107,17 @@ def replace_linear_class( raise ValueError( f"Unsupported parallel style type {type(style)}, expected str") - vllm_linear_cls = { - "colwise": ColumnParallelLinear, - "rowwise": RowParallelLinear, - }.get(style, ReplicatedLinear) + vllm_linear_cls, vllm_linear_kwargs = { + "colwise": (ColumnParallelLinear, {}), + "colwise_rep": 
(ColumnParallelLinear, { + "gather_output": True + }), + "rowwise": (RowParallelLinear, {}), + "rowwise_rep": (RowParallelLinear, { + "input_is_parallel": False + }), + "replicate": (ReplicatedLinear, {}), + }.get(style, (ReplicatedLinear, {})) return vllm_linear_cls( input_size=linear.in_features, @@ -118,6 +125,7 @@ def replace_linear_class( bias=linear.bias is not None, quant_config=quant_config, return_bias=False, + **vllm_linear_kwargs, ) @@ -506,7 +514,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): # Some weight loaders expect linear layers to inherit from vLLM's # LinearBase class, so we set a default style which causes any # unspecified linear layers to be replaced with ReplicatedLinear - tp_plan[".*"] = "replicated" + tp_plan[".*"] = "replicate" def _tensor_parallel(module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): From 95a935fc48563ec63de02a65d41fd2d7cb1d9ea5 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 17:46:59 -0700 Subject: [PATCH 173/932] [gpt-oss] Support streaming in response API (#22431) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/serving_responses.py | 450 ++++++++++++++++++- 1 file changed, 445 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 1e3746e956..089f50a1e6 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import json import time from collections.abc import AsyncGenerator, AsyncIterator from contextlib import AsyncExitStack @@ -10,10 +11,22 @@ from http import HTTPStatus from typing import Any, Callable, Final, Optional, Union import jinja2 +import openai.types.responses as openai_responses_types from fastapi import Request -from openai.types.responses import (ResponseFunctionToolCall, - ResponseOutputItem, ResponseOutputMessage, - ResponseOutputText, ResponseReasoningItem) +from openai import BaseModel +# yapf conflicts with isort for this block +# yapf: disable +from openai.types.responses import (ResponseContentPartDoneEvent, + ResponseCreatedEvent, + ResponseFunctionToolCall, + ResponseInProgressEvent, + ResponseOutputItem, + ResponseOutputItemDoneEvent, + ResponseOutputMessage, ResponseOutputText, + ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent) +# yapf: enable from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent) from openai_harmony import Message as OpenAIHarmonyMessage @@ -330,8 +343,15 @@ class OpenAIServingResponses(OpenAIServing): return response if request.stream: - raise NotImplementedError( - "Streaming responses are not supported") + return self.responses_stream_generator( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + ) try: return await self.responses_full_generator( @@ -744,3 +764,423 @@ class OpenAIServingResponses(OpenAIServing): "starting the vLLM server."), status_code=HTTPStatus.BAD_REQUEST, ) + + async def responses_stream_generator( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[Optional[ConversationContext]], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: Optional[int] = None, + ) 
-> AsyncGenerator[str, None]: + # TODO: + # 1. Handle disconnect + + if not isinstance(context, StreamingHarmonyContext): + raise NotImplementedError( + "Streaming is not supported for responses API without Harmony." + ) + + created_time = created_time or int(time.time()) + + sequence_number = 0 + + def _send_event(event: BaseModel): + nonlocal sequence_number + # Set sequence_number if the event has this attribute + if hasattr(event, 'sequence_number'): + event.sequence_number = sequence_number + sequence_number += 1 + # Get event type from the event's type field if it exists + event_type = getattr(event, 'type', 'unknown') + return (f"event: {event_type}\n" + f"data: {event.model_dump_json(indent=None)}\n\n") + + current_content_index = 0 # FIXME: this number is never changed + current_output_index = 0 + current_item_id = "" # FIXME: this number is never changed + sent_output_item_added = False + + initial_response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="in_progress", + usage=None, + ).model_dump() + yield _send_event( + ResponseCreatedEvent( + type="response.created", + sequence_number=-1, + response=initial_response, + )) + yield _send_event( + ResponseInProgressEvent( + type="response.in_progress", + sequence_number=-1, + response=initial_response, + )) + + async for ctx in result_generator: + + assert isinstance(ctx, StreamingHarmonyContext) + + if ctx.is_expecting_start(): + current_output_index += 1 + sent_output_item_added = False + + if len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if previous_item.recipient is not None: + # Deal with tool call here + pass + elif previous_item.channel == "analysis": + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=previous_item.content[0].text), + ], + status="completed", + ) + yield _send_event( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + )) + yield _send_event( + ResponseContentPartDoneEvent( + type="response.content_part.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + part=reasoning_item, + )) + yield _send_event( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + )) + elif previous_item.channel == "final": + text_content = ResponseOutputText( + type="output_text", + text=previous_item.content[0].text, + annotations=[], + ) + yield _send_event( + openai_responses_types.ResponseTextDoneEvent( + type="response.output_text.done", + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + logprobs=[], + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. 
+ ResponseContentPartDoneEvent( + type="response.content_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=text_content, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[text_content], + status="completed", + ), + )) + + if ctx.parser.last_content_delta: + if (ctx.parser.current_channel == "final" + and ctx.parser.current_recipient is None): + if not sent_output_item_added: + sent_output_item_added = True + yield _send_event( + openai_responses_types. + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + yield _send_event( + openai_responses_types.ResponseTextDeltaEvent( + type="response.output_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=ctx.parser.last_content_delta, + # TODO, use logprobs from ctx.last_request_output + logprobs=[], + )) + elif (ctx.parser.current_channel == "analysis" + and ctx.parser.current_recipient is None): + if not sent_output_item_added: + sent_output_item_added = True + yield _send_event( + openai_responses_types. + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseReasoningItem( + type="reasoning", + id=current_item_id, + summary=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + yield _send_event( + ResponseReasoningTextDeltaEvent( + type="response.reasoning_text.delta", + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + delta=ctx.parser.last_content_delta, + sequence_number=-1, + )) + + if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if (self.tool_server is not None + and self.tool_server.has_tool("browser") + and previous_item.recipient is not None + and previous_item.recipient.startswith("browser.")): + function_name = previous_item.recipient[len("browser."):] + action = None + parsed_args = json.loads(previous_item.content[0].text) + if function_name == "search": + action = (openai_responses_types. + response_function_web_search.ActionSearch( + type="search", + query=parsed_args["query"], + )) + elif function_name == "open": + action = ( + openai_responses_types. 
+ response_function_web_search.ActionOpenPage( + type="open_page", + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + )) + elif function_name == "find": + action = ( + openai_responses_types. + response_function_web_search.ActionFind( + type="find", + pattern=parsed_args["pattern"], + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + )) + else: + raise ValueError( + f"Unknown function name: {function_name}") + + yield _send_event( + openai_responses_types.ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + response_function_web_search. + ResponseFunctionWebSearch( + # TODO: generate a unique id for web search call + type="web_search_call", + id=current_item_id, + action=action, + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseWebSearchCallInProgressEvent( + type="response.web_search_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. + ResponseWebSearchCallSearchingEvent( + type="response.web_search_call.searching", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + + # enqueue + yield _send_event( + openai_responses_types. + ResponseWebSearchCallCompletedEvent( + type="response.web_search_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseFunctionWebSearch( + type="web_search_call", + id=current_item_id, + action=action, + status="completed", + ), + )) + + if (self.tool_server is not None + and self.tool_server.has_tool("python") + and previous_item.recipient is not None + and previous_item.recipient.startswith("python")): + yield _send_event( + openai_responses_types.ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code="", + container_id="auto", + outputs=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallInProgressEvent( + type="response.code_interpreter_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + # TODO: do we need to add delta event here? + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallCodeDoneEvent( + type="response.code_interpreter_call_code.done", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + code=previous_item.content[0].text)) + yield _send_event( + openai_responses_types. + ResponseCodeInterpreterCallInterpretingEvent( + type="response.code_interpreter_call.interpreting", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types. 
+ ResponseCodeInterpreterCallCompletedEvent( + type="response.code_interpreter_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code=previous_item.content[0].text, + container_id="auto", + # TODO: add outputs here + outputs=[], + status="completed", + ), + )) + + async def empty_async_generator(): + # A hack to trick Python to think this is a generator but in fact + # it immediately returns. + if False: + yield + + final_response = await self.responses_full_generator( + request, + sampling_params, + empty_async_generator(), + context, + model_name, + tokenizer, + request_metadata, + created_time=created_time, + ) + yield _send_event( + openai_responses_types.ResponseCompletedEvent( + type="response.completed", + sequence_number=-1, + response=final_response.model_dump(), + )) From 1891a265d316217f9c1e552cf7c380ef5bf1eec1 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 17:47:24 -0700 Subject: [PATCH 174/932] [gpt-oss] Add test for response API + harmony (but skipped) (#22554) Signed-off-by: Chen Zhang --- .../openai/test_response_api_with_harmony.py | 624 ++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 tests/entrypoints/openai/test_response_api_with_harmony.py diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py new file mode 100644 index 0000000000..1ca52599c5 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -0,0 +1,624 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import time + +import pytest +import pytest_asyncio +import requests +from openai import BadRequestError, NotFoundError, OpenAI + +from ...utils import RemoteOpenAIServer + +pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.") + +MODEL_NAME = "openai/gpt-oss-20b" +DTYPE = "bfloat16" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--tool-server", "demo"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response is not None + print("response: ", response) + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic_with_instructions(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + instructions="Respond in Korean.", + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is the capital 
of South Korea?", + reasoning={"effort": "low"}, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "system", + "content": "Respond in Korean." + }, + { + "role": "user", + "content": "Hello!" + }, + { + "role": "assistant", + "content": "Hello! How can I help you today?" + }, + { + "role": "user", + "content": "What is 13 * 24? Explain your answer." + }, + ], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat_with_input_type(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "user", + "content": [{ + "type": "input_text", + "text": "What is 13*24?" + }], + }, + ], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_structured_output(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + { + "role": "system", + "content": "Extract the event information." + }, + { + "role": "user", + "content": + "Alice and Bob are going to a science fair on Friday.", + }, + ], + text={ + "format": { + "type": "json_schema", + "name": "calendar_event", + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "date": { + "type": "string" + }, + "participants": { + "type": "array", + "items": { + "type": "string" + } + }, + }, + "required": ["name", "date", "participants"], + "additionalProperties": False, + }, + "description": "A calendar event.", + "strict": True, + } + }, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_structured_output_with_parse(client: OpenAI, model_name: str): + from pydantic import BaseModel + + class CalendarEvent(BaseModel): + name: str + date: str + participants: list[str] + + response = await client.responses.parse( + model=model_name, + input="Alice and Bob are going to a science fair on Friday", + instructions="Extract the event information", + text_format=CalendarEvent, + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_store(client: OpenAI, model_name: str): + for store in [True, False]: + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + store=store, + ) + assert response is not None + + try: + _retrieved_response = await client.responses.retrieve(response.id) + is_not_found = False + except NotFoundError: + is_not_found = True + + assert is_not_found == (not store) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_background(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + background=True, + ) + assert response is not None + + retries = 0 + max_retries = 30 + while retries < max_retries: + response = await client.responses.retrieve(response.id) + if response.status == "completed": + break + time.sleep(1) + retries += 1 + + assert 
response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_background_cancel(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Write a long story about a cat.", + background=True, + ) + assert response is not None + time.sleep(1) + + cancelled_response = await client.responses.cancel(response.id) + assert cancelled_response is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_stateful_multi_turn(client: OpenAI, model_name: str): + response1 = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response1 is not None + assert response1.status == "completed" + + response2 = await client.responses.create( + model=model_name, + input="What if I increase both numbers by 1?", + previous_response_id=response1.id, + ) + assert response2 is not None + assert response2.status == "completed" + + response3 = await client.responses.create( + model=model_name, + input="Divide the result by 2.", + previous_response_id=response2.id, + ) + assert response3 is not None + assert response3.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_streaming(client: OpenAI, model_name: str): + prompts = [ + "tell me a story about a cat in 20 words", + "What is 13 * 24? Use python to calculate the result.", + "When did Jensen found NVIDIA? Search it and answer the year only.", + ] + + for prompt in prompts: + response = await client.responses.create( + model=model_name, + input=prompt, + reasoning={"effort": "low"}, + tools=[ + { + "type": "web_search_preview" + }, + { + "type": "code_interpreter", + "container": { + "type": "auto" + } + }, + ], + stream=True, + ) + + events = [] + current_event_mode = None + async for event in response: + if current_event_mode != event.type: + current_event_mode = event.type + print(f"\n[{event.type}] ", end="", flush=True) + + if "text.delta" in event.type: + print(event.delta, end="", flush=True) + elif "reasoning_text.delta" in event.type: + print(f"{event.delta}", end="", flush=True) + elif "response.code_interpreter_call_code.done" in event.type: + print(f"Code: {event.code}", end="", flush=True) + elif ("response.output_item.added" in event.type + and event.item.type == "web_search_call"): + print(f"Web search: {event.item.action}", end="", flush=True) + events.append(event) + + assert len(events) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_web_search(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Who is the president of South Korea as of now?", + tools=[{ + "type": "web_search_preview" + }], + ) + assert response is not None + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_code_interpreter(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Multiply 64548*15151 using builtin python interpreter.", + tools=[{ + "type": "code_interpreter", + "container": { + "type": "auto" + } + }], + ) + assert response is not None + assert response.status == "completed" + + +def get_weather(latitude, longitude): + response = requests.get( + 
f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}¤t=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" # noqa + ) + data = response.json() + return data["current"]["temperature_2m"] + + +def get_place_to_travel(): + return "Paris" + + +def call_function(name, args): + if name == "get_weather": + return get_weather(**args) + elif name == "get_place_to_travel": + return get_place_to_travel() + else: + raise ValueError(f"Unknown function: {name}") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + response = await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + ) + assert response is not None + assert response.status == "completed" + assert len(response.output) == 2 + assert response.output[0].type == "reasoning" + assert response.output[1].type == "function_call" + + tool_call = response.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_2 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response.id, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert response_2.output_text is not None + + # NOTE: chain-of-thought should be removed. + response_3 = await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + previous_response_id=response_2.id, + ) + assert response_3 is not None + assert response_3.status == "completed" + assert response_3.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_multi_turn(client: OpenAI, model_name: str): + tools = [ + { + "type": "function", + "name": "get_place_to_travel", + "description": "Get a random place to travel", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False, + }, + "strict": True, + }, + { + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }, + ] + + response = await client.responses.create( + model=model_name, + input= + "Help me plan a trip to a random place. 
And tell me the weather there.", + tools=tools, + ) + assert response is not None + assert response.status == "completed" + assert len(response.output) == 2 + assert response.output[0].type == "reasoning" + assert response.output[1].type == "function_call" + + tool_call = response.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_2 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response.id, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert len(response_2.output) == 2 + assert response_2.output[0].type == "reasoning" + assert response_2.output[1].type == "function_call" + + tool_call = response_2.output[1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + response_3 = await client.responses.create( + model=model_name, + input=[{ + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + }], + tools=tools, + previous_response_id=response_2.id, + ) + assert response_3 is not None + assert response_3.status == "completed" + assert response_3.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_required(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + with pytest.raises(BadRequestError): + await client.responses.create( + model=model_name, + input="What's the weather like in Paris today?", + tools=tools, + tool_choice="required", + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_full_history(client: OpenAI, model_name: str): + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + }, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + }] + + input_messages = [{ + "role": "user", + "content": "What's the weather like in Paris today?" 
+ }] + + response = await client.responses.create( + model=model_name, + input=input_messages, + tools=tools, + ) + + assert response is not None + assert response.status == "completed" + + tool_call = response.output[-1] + name = tool_call.name + args = json.loads(tool_call.arguments) + + result = call_function(name, args) + + input_messages.extend( + response.output) # append model's function call message + input_messages.append( + { # append result message + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ) + + response_2 = await client.responses.create( + model=model_name, + input=input_messages, + tools=tools, + ) + assert response_2 is not None + assert response_2.status == "completed" + assert response_2.output_text is not None From 9b94d6ec8f5f2e8ec2d3897ed05fb2b13cc012da Mon Sep 17 00:00:00 2001 From: Andy Chen <37168711+py-andy-c@users.noreply.github.com> Date: Mon, 11 Aug 2025 19:02:14 -0700 Subject: [PATCH 175/932] Enable 4bit bnb prequant MOE (#21548) Signed-off-by: Jee Jee Li Co-authored-by: Jee Jee Li --- .../model_executor/model_loader/bitsandbytes_loader.py | 10 +++------- vllm/model_executor/models/qwen3_moe.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index ea2fb2e3ac..b8393956ee 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): elif isinstance(module, FusedMoE) and hasattr( module.quant_method, "quant_config"): # TODO: support FusedMoE with prequant and 8bit. - if self.pre_quant: + if self.pre_quant and self.load_8bit: raise ValueError( - "Prequant BitsAndBytes models with FusedMoE is not " - "supported yet.") - if self.load_8bit: - raise ValueError( - "BitsAndBytes 8bit quantization with FusedMoE is not " - "supported yet.") + "Prequant BitsAndBytes 8bit models with FusedMoE " + "is not supported yet.") # Get the corresponding weight name using module name and # expert_params_mapping. 
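With the relaxed check above, pre-quantized 4-bit bitsandbytes checkpoints that use FusedMoE can now be loaded; only the pre-quantized 8-bit + FusedMoE combination is still rejected. A minimal usage sketch, not part of this patch, with a hypothetical placeholder checkpoint name:

    # Hedged sketch: loading a pre-quantized 4-bit bitsandbytes MoE checkpoint.
    # "some-org/qwen3-moe-bnb-4bit" is a placeholder, not a real repository.
    from vllm import LLM

    llm = LLM(
        model="some-org/qwen3-moe-bnb-4bit",
        quantization="bitsandbytes",
    )
    outputs = llm.generate("Hello, my name is")
    print(outputs[0].outputs[0].text)
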
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 9b49952f37..085fc90b47 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, return loader.load_weights(weights) def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() + return self.model.get_expert_mapping() \ No newline at end of file From 839ab0034932e5e6863a8d837e5b04944fa0cac5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 03:54:40 +0100 Subject: [PATCH 176/932] Re-enable Xet on TPU tests now that `hf_xet` has been updated (#22666) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh | 2 +- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- tests/entrypoints/llm/test_accuracy.py | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 10d2e23649..b571618f48 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -128,7 +128,7 @@ run_and_track_test() { # --- Actual Test Execution --- run_and_track_test 1 "test_struct_output_generate.py" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" run_and_track_test 2 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 3 "test_lora.py" \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 9e7b5a5462..d55a786e41 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \ run_and_track_test 2 "test_basic.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ - "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" run_and_track_test 4 "test_quantization_accuracy.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" run_and_track_test 5 "examples/offline_inference/tpu.py" \ diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 39bc8ab07d..5d605e906e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -96,9 +96,6 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( more_args = None if current_platform.is_tpu(): # Limit compilation time for TPU V1 - - # xet doesn't work well for Qwen/Qwen3-1.7B - m.setenv("HF_HUB_DISABLE_XET", "1") more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" # Add TP test (if provided) From dc5e4a653c859573dfcca99f1b0141c2db9f94cc Mon Sep 17 
00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Tue, 12 Aug 2025 10:58:41 +0800 Subject: [PATCH 177/932] Upgrade FlashInfer to v0.2.11 (#22613) Signed-off-by: Po-Han Huang Co-authored-by: mgoin --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 85f55cac8d..b96d50f0a1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -387,7 +387,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.10" +ARG FLASHINFER_GIT_REF="v0.2.11" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ diff --git a/setup.py b/setup.py index 7f6c787129..919300e143 100644 --- a/setup.py +++ b/setup.py @@ -684,7 +684,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.10"], + "flashinfer": ["flashinfer-python==0.2.11"], }, cmdclass=cmdclass, package_data=package_data, From ea1292ad3ee724e44b3dfec2a26778cd614729f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 11 Aug 2025 23:20:42 -0400 Subject: [PATCH 178/932] [CI Failure] Use float32 for tests/entrypoints/openai/test_audio.py (#22686) Signed-off-by: mgoin --- tests/entrypoints/openai/test_audio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index d67c05ab3e..2d33d3c3a6 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -23,6 +23,8 @@ MAXIMUM_AUDIOS = 2 @pytest.fixture(scope="module") def server(): args = [ + "--dtype", + "float32", "--max-model-len", "2048", "--max-num-seqs", From 93d0652433f9385959d5296a4dc1c98ec58f0d58 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 11 Aug 2025 23:31:36 -0400 Subject: [PATCH 179/932] [CI] Increase timeout for test_completion_with_image_embeds (#22670) Signed-off-by: mgoin --- .../v1/entrypoints/openai/test_completion_with_image_embeds.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py index be98be8d14..41f1d02bf7 100644 --- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py +++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py @@ -38,7 +38,8 @@ def default_image_embeds_server_args() -> list[str]: @pytest.fixture(scope="module") def server_with_image_embeds(default_image_embeds_server_args): with RemoteOpenAIServer(MODEL_NAME, - default_image_embeds_server_args) as remote_server: + default_image_embeds_server_args, + max_wait_seconds=600) as remote_server: yield remote_server From 467850347687f0ef76c1a57d79e2c0639eaa1456 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Mon, 11 Aug 2025 20:43:37 -0700 Subject: [PATCH 180/932] Migrate MiniCPMVImageInputs to TensorSchema (#21939) Signed-off-by: Benji Beck --- vllm/model_executor/models/minicpmv.py | 65 ++++++++++++++------------ 1 file changed, 36 
insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 3aa16bb9ab..7db3a1bb90 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -27,7 +27,7 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import Any, Callable, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Callable, Literal, Optional, Union import numpy as np import torch @@ -63,6 +63,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import flatten_2d_lists +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -74,36 +75,47 @@ from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, _MAX_FRAMES_PER_VIDEO = 16 -class MiniCPMVImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: list[torch.Tensor] +class MiniCPMVImagePixelInputs(TensorSchema): """ - Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` - - Note that the image size may vary, so we pass it as a list - instead of a batched tensor. + Dimensions: + - bns: Batch size * number of images * number of slices + - bn: Batch size * number of images + - c: Number of channels + - h: Height + - w: Width """ - tgt_sizes: torch.Tensor - """ - Shape: `(batch_size * num_images * num_slices, 2)` + type: Literal["pixel_values"] = "pixel_values" - This should be in `(height, width)` format. + # Note that the image size may vary, so we pass it as a list instead of a + # batched tensor. + pixel_values: Annotated[ + list[torch.Tensor], + TensorShape("bns", "c", "h", "w"), + ] + tgt_sizes: Annotated[ + torch.Tensor, + TensorShape("bns", 2), # This should be in `(height, width)` format. + ] + num_slices: Annotated[ + torch.Tensor, + TensorShape("bn"), + ] + + +class MiniCPMVImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - ns: Number of slices + - hs: Hidden size (must match language model backbone) """ - num_slices: torch.Tensor - """Shape: `(batch_size * num_images)`""" - - -class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - image_embeds: Union[torch.Tensor, list[torch.Tensor]] - """ - Shape: `(batch_size * num_images, num_slices, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. - instead of a batched tensor. - """ + image_embeds: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "ns", "hs"), + ] MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, @@ -832,11 +844,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): pixel_values_flat = flatten_bn(flatten_2d_lists(pixel_values)) tgt_sizes_flat = flatten_bn(flatten_2d_lists(tgt_sizes), concat=True) - if len(pixel_values_flat) != len(tgt_sizes_flat): - raise ValueError("Inconsistent flattened lengths, found: " - f"{len(pixel_values_flat)} vs. 
" - f"{len(tgt_sizes_flat)}") - return MiniCPMVImagePixelInputs( type="pixel_values", pixel_values=pixel_values_flat, From bbaf9e9cb15af23e7a1fd250bf49a5efb15cadf7 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 21:22:26 -0700 Subject: [PATCH 181/932] [gpt-oss] Fix mxfp4 support (#22700) Signed-off-by: Chen Zhang --- vllm/model_executor/layers/quantization/utils/mxfp4_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 4084dd837c..95eabe149d 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -68,7 +68,7 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "silu" + or scoring_func != "softmax" or activation != "swiglu_oai" or expert_load_view or logical_to_physical_map or logical_replica_count) From ad344ef552ece428d12e04fbcb64b8b50768283b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 11 Aug 2025 22:04:38 -0700 Subject: [PATCH 182/932] [gpt-oss] Small bug fixes for frontend (#22512) Signed-off-by: Chen Zhang --- vllm/entrypoints/context.py | 54 +++++++++++++++----- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_responses.py | 29 ++++++----- vllm/entrypoints/tool.py | 15 +++++- vllm/entrypoints/tool_server.py | 5 +- 5 files changed, 76 insertions(+), 32 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 6292306e7c..e817f07ef5 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -1,15 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import logging from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Union -from openai_harmony import Message, Role, StreamState +from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, render_for_completion) from vllm.entrypoints.tool import Tool from vllm.outputs import RequestOutput +if TYPE_CHECKING: + from mcp.client import ClientSession + logger = logging.getLogger(__name__) @@ -71,6 +76,7 @@ class HarmonyContext(ConversationContext): def append_output(self, output) -> None: if isinstance(output, RequestOutput): output_token_ids = output.outputs[0].token_ids + self.parser = get_streamable_parser_for_assistant() for token_id in output_token_ids: self.parser.process(token_id) output_msgs = self.parser.messages @@ -106,19 +112,41 @@ class HarmonyContext(ConversationContext): def render_for_completion(self) -> list[int]: return render_for_completion(self.messages) - async def call_search_tool( - self, - tool_session: Tool, - last_msg: Message, - ) -> list[Message]: - return await tool_session.get_result(self) + async def call_search_tool(self, tool_session: Union["ClientSession", + Tool], + last_msg: Message) -> list[Message]: + if isinstance(tool_session, Tool): + return await tool_session.get_result(self) + tool_name = last_msg.recipient.split(".")[1] + args = json.loads(last_msg.content[0].text) + result = await tool_session.call_tool(tool_name, args) + result_str = 
result.content[0].text + content = TextContent(text=result_str) + author = Author(role=Role.TOOL, name=last_msg.recipient) + return [ + Message(author=author, content=[content], recipient=Role.ASSISTANT) + ] - async def call_python_tool( - self, - tool_session: Tool, - last_msg: Message, - ) -> list[Message]: - return await tool_session.get_result(self) + async def call_python_tool(self, tool_session: Union["ClientSession", + Tool], + last_msg: Message) -> list[Message]: + if isinstance(tool_session, Tool): + return await tool_session.get_result(self) + param = { + "code": last_msg.content[0].text, + } + result = await tool_session.call_tool("python", param) + result_str = result.content[0].text + + content = TextContent(text=result_str) + author = Author(role=Role.TOOL, name="python") + + return [ + Message(author=author, + content=[content], + channel=last_msg.channel, + recipient=Role.ASSISTANT) + ] class StreamingHarmonyContext(HarmonyContext): diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 3b9f4b544e..543701ed14 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -19,8 +19,8 @@ from openai.types.chat.chat_completion_message import ( # yapf: enable from openai.types.responses import (ResponseFunctionToolCall, ResponseInputItemParam, ResponseOutputItem, - ResponsePrompt, ResponseStatus, - ResponseTextConfig) + ResponsePrompt, ResponseReasoningItem, + ResponseStatus, ResponseTextConfig) from openai.types.responses.response import ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning @@ -239,6 +239,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors], ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, + ResponseReasoningItem, ResponseFunctionToolCall] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 089f50a1e6..86c16df40e 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -16,8 +16,7 @@ from fastapi import Request from openai import BaseModel # yapf conflicts with isort for this block # yapf: disable -from openai.types.responses import (ResponseContentPartDoneEvent, - ResponseCreatedEvent, +from openai.types.responses import (ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent, ResponseOutputItem, @@ -54,7 +53,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.tool_server import ToolServer +from vllm.entrypoints.tool_server import MCPToolServer, ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput @@ -238,6 +237,15 @@ class OpenAIServingResponses(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata + if self.tool_server is not None and isinstance( + self.tool_server, MCPToolServer + ) and (request.background or request.stream) and request.tools and any( + tool.type in ["web_search_preview", "code_interpreter"] + for tool in request.tools): + return self.create_error_response( + "MCP tool server is not supported in background mode and " + "streaming mode") + # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[ConversationContext, None]] = [] @@ -844,9 +852,13 @@ class OpenAIServingResponses(OpenAIServing): type="reasoning", content=[ ResponseReasoningTextContent( - text=previous_item.content[0].text), + text=previous_item.content[0].text, + type="reasoning_text", + ), ], status="completed", + id=current_item_id, + summary=[], ) yield _send_event( ResponseReasoningTextDoneEvent( @@ -857,15 +869,6 @@ class OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) - yield _send_event( - ResponseContentPartDoneEvent( - type="response.content_part.done", - item_id=current_item_id, - sequence_number=-1, - output_index=current_output_index, - content_index=current_content_index, - part=reasoning_item, - )) yield _send_event( ResponseOutputItemDoneEvent( type="response.output_item.done", diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 01ee77414f..723cff91d4 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional + +from openai_harmony import Message from vllm.logger import init_logger @@ -70,7 +72,16 @@ class HarmonyPythonTool(Tool): "gpt_oss is not installed, code interpreter is disabled") return - self.python_tool = PythonTool() + # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response + # and we do the following monkey patch to fix it. + class PatchedGptOssPythonTool(PythonTool): + + def _make_response(self, + output: str, + channel: Optional[str] = None) -> Message: + return super()._make_response(output) + + self.python_tool = PatchedGptOssPythonTool() logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py index 352704b2b3..2f28595f27 100644 --- a/vllm/entrypoints/tool_server.py +++ b/vllm/entrypoints/tool_server.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from contextlib import AbstractAsyncContextManager, asynccontextmanager from typing import TYPE_CHECKING, Any, Optional -from openai_harmony import ToolNamespaceConfig +from openai_harmony import ToolDescription, ToolNamespaceConfig from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool from vllm.logger import init_logger @@ -105,7 +105,6 @@ class MCPToolServer(ToolServer): self.harmony_tool_descriptions = {} async def add_tool_server(self, server_url: str): - from mcp.types import ToolDescription tool_urls = server_url.split(",") self.harmony_tool_descriptions = {} self.urls: dict[str, str] = {} @@ -133,6 +132,8 @@ class MCPToolServer(ToolServer): logger.warning( "Tool %s already exists. 
Ignoring duplicate tool server %s", tool_from_mcp.name, url) + logger.info("MCPToolServer initialized with tools: %s", + list(self.harmony_tool_descriptions.keys())) def has_tool(self, tool_name: str): return tool_name in self.harmony_tool_descriptions From 4fbd8bb597cf392b94def04a6955f22580356d76 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 06:13:32 +0100 Subject: [PATCH 183/932] Fix passing `SpeculativeConfig` from the CLI (#22652) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41a6da709b..d74db67bda 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -826,6 +826,10 @@ class EngineArgs: title="VllmConfig", description=VllmConfig.__doc__, ) + # We construct SpeculativeConfig using fields from other configs in + # create_engine_config. So we set the type to a JSON string here to + # delay the Pydantic validation that comes with SpeculativeConfig. + vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads) vllm_group.add_argument("--speculative-config", **vllm_kwargs["speculative_config"]) vllm_group.add_argument("--kv-transfer-config", From 3a7e3bbdd255b470d37727a31cc0471aa0fe6ecb Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Tue, 12 Aug 2025 15:14:51 +0800 Subject: [PATCH 184/932] [Doc] Added unmentioned required option "method" in the usage of EAGLE-3 based models (#21737) Signed-off-by: Dilute-l Co-authored-by: Dilute-l --- docs/features/spec_decode.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index 89d5b489e1..597a8e8644 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -203,6 +203,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "draft_tensor_parallel_size": 1, "num_speculative_tokens": 2, + "method": "eagle", }, ) @@ -231,6 +232,9 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: . +4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3". + That is, to specify `"method": "eagle3"` in `speculative_config`. 
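+    A minimal sketch of such a config (model names here are illustrative; substitute
+    the EAGLE-3 draft model that matches your base model):
+
+    ```python
+    from vllm import LLM
+
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        speculative_config={
+            "model": "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
+            "num_speculative_tokens": 2,
+            "method": "eagle3",
+        },
+    )
+    ```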
+ A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | From 2f4657952b1a118e616165e57af94c9007121fb8 Mon Sep 17 00:00:00 2001 From: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:51:08 +0530 Subject: [PATCH 185/932] [doc] Update x86 CPU-inference installation doc to reflect optionality of AVX512f (#22707) Signed-off-by: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- docs/getting_started/installation/cpu/x86.inc.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md index 49e223f9b9..6dc6f94249 100644 --- a/docs/getting_started/installation/cpu/x86.inc.md +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -6,7 +6,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data # --8<-- [start:requirements] - OS: Linux -- CPU flags: `avx512f`, `avx512_bf16` (Optional), `avx512_vnni` (Optional) +- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional) !!! tip Use `lscpu` to check the CPU flags. @@ -28,7 +28,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) !!! warning - If deploying the pre-built images on machines only contain `avx512f`, `Illegal instruction` error may be raised. It is recommended to build images for these machines with `--build-arg VLLM_CPU_AVX512BF16=false` and `--build-arg VLLM_CPU_AVX512VNNI=false`. + If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. It is recommended to build images for these machines with the appropriate build arguments (e.g., `--build-arg VLLM_CPU_DISABLE_AVX512=true`, `--build-arg VLLM_CPU_AVX512BF16=false`, or `--build-arg VLLM_CPU_AVX512VNNI=false`) to disable unsupported features. Please note that without `avx512f`, AVX2 will be used and this version is not recommended because it only has basic feature support. # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] @@ -37,6 +37,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data docker build -f docker/Dockerfile.cpu \ --build-arg VLLM_CPU_AVX512BF16=false (default)|true \ --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \ + --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ --tag vllm-cpu-env \ --target vllm-openai . From 6d729c43fbaf63d534e71c0b8aa61f0a82dd2018 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 15:23:17 +0800 Subject: [PATCH 186/932] [Bugfix] Fix ModernBert load & Enable sliding window attention for bidirectional attention. 
(#22637) Signed-off-by: wang.yuqi Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser --- tests/models/language/pooling/test_gte.py | 21 ++++- vllm/model_executor/models/modernbert.py | 31 +++---- vllm/v1/attention/backends/flash_attn.py | 2 + vllm/v1/worker/gpu_model_runner.py | 106 ++++++++++++++-------- 4 files changed, 101 insertions(+), 59 deletions(-) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 5a5fdfbb21..f805a64103 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -4,10 +4,11 @@ from typing import Any import pytest -from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo, - LASTPoolingEmbedModelInfo, check_transformers_version) +from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, + EmbedModelInfo, LASTPoolingEmbedModelInfo, + RerankModelInfo, check_transformers_version) from .embed_utils import correctness_test_embed_models -from .mteb_utils import mteb_test_embed_models +from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel @@ -58,6 +59,14 @@ MODELS = [ enable_test=False), ] +RERANK_MODELS = [ + # classifier_pooling: mean + CLSPoolingRerankModelInfo( + "Alibaba-NLP/gte-reranker-modernbert-base", + architecture="ModernBertForSequenceClassification", + enable_test=True), +] + @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, @@ -88,3 +97,9 @@ def test_embed_models_correctness(hf_runner, vllm_runner, correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts, vllm_extra_kwargs) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb(hf_runner, vllm_runner, + model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 2c3bdd1c93..c6e84e2d4e 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,8 +26,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsV0Only, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, default_pooling_type from .utils import WeightsMapper, maybe_prefix @@ -93,16 +92,14 @@ class ModernBertAttention(nn.Module): bias=config.attention_bias, ) + sliding_window = None if layer_id % config.global_attn_every_n_layers != 0: - self.local_attention = (config.local_attention // 2, - config.local_attention // 2) + sliding_window = config.local_attention // 2 + rope_theta = config.local_rope_theta if config.local_rope_theta \ + is not None else config.global_rope_theta else: - self.local_attention = (-1, -1) + rope_theta = config.global_rope_theta - rope_theta = config.global_rope_theta - if self.local_attention != ( - -1, -1) and config.local_rope_theta is not None: - rope_theta = config.local_rope_theta self.rotary_emb = ModernBertRotaryEmbedding(config=config, head_size=self.head_dim, dim=self.head_dim, @@ -111,7 +108,8 @@ class ModernBertAttention(nn.Module): self.head_dim, self.scaling, prefix=f"{layer_id}.attn", - attn_type=AttentionType.ENCODER_ONLY) + attn_type=AttentionType.ENCODER_ONLY, + per_layer_sliding_window=sliding_window) self.Wo = RowParallelLinear(config.hidden_size, config.hidden_size, 
bias=config.attention_bias) @@ -278,6 +276,7 @@ class ModernBertPooler(Pooler): return self.pooling.get_pooling_updates(task) def _head(self, pooled_output: torch.Tensor): + pooled_output = pooled_output.to(self.dense.weight.dtype) return self.norm(self.act(self.dense(pooled_output))) def forward( @@ -296,8 +295,7 @@ class ModernBertPooler(Pooler): @default_pooling_type("CLS") -class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, - SupportsCrossEncoding): +class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): is_pooling_model = True @@ -308,6 +306,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.pooling = ModernBertPooler(config) pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None @@ -317,14 +316,14 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, Pooler.for_encode(pooler_config), "classify": ClassifierPooler( - pooling=ModernBertPooler(config), + pooling=self.pooling, classifier=self.classifier, act_fn=ClassifierPooler.act_fn_for_seq_cls( vllm_config.model_config), ), "score": ClassifierPooler( - pooling=ModernBertPooler(config), + pooling=self.pooling, classifier=self.classifier, act_fn=ClassifierPooler.act_fn_for_cross_encoder( vllm_config.model_config), @@ -353,7 +352,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, default_weight_loader) weight_loader(param, loaded_weight) if name.startswith("head"): - param = params_dict["_pooler.pooler." + name[len("head") + 1:]] + param = params_dict["pooling." + name[len("head") + 1:]] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) @@ -368,5 +367,5 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, return self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, - position_ids=positions, + positions=positions, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 95ba56b359..a411477bc3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -384,6 +384,8 @@ class FlashAttentionImpl(AttentionImpl): self.alibi_slopes = alibi_slopes if sliding_window is None: self.sliding_window = (-1, -1) + elif attn_type == AttentionType.ENCODER_ONLY: + self.sliding_window = (sliding_window - 1, sliding_window - 1) else: self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 045a06d927..ed4d6bcb09 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -826,7 +826,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Prepare encoder attention metadata separately # (encoder layers are not in KV cache groups) if self.is_encoder_only_model: - common_attn_metadata, encoder_attn_metadata = \ + + per_layer_metadata = \ self._build_encoder_only_attn_metadata( scheduler_output) @@ -835,6 +836,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, Attention) for layer_name, attn_module in attention_layers.items(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: + common_attn_metadata, encoder_attn_metadata =\ + per_layer_metadata[layer_name] 
attn_metadata[layer_name] = encoder_attn_metadata # Prepare the attention metadata for each KV cache group and make layers @@ -2683,30 +2686,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Check if model is encoder-only block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla - attn_specs = list[AttentionSpec]() - for attn_module in attn_layers.values(): + attn_specs: dict[AttentionSpec, list[str]] = defaultdict(list) + for layer_name, attn_module in attn_layers.items(): if attn_module.attn_type == AttentionType.ENCODER_ONLY: - assert attn_module.sliding_window is None, "Sliding " - "window attention is not supported for encoder-only models" + if attn_module.sliding_window is None: + attn_spec: AttentionSpec = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + use_mla=use_mla) + else: + attn_spec = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=self.kv_cache_dtype, + sliding_window=attn_module.sliding_window, + use_mla=use_mla) + attn_specs[attn_spec].append(layer_name) - attn_specs.append( - FullAttentionSpec(block_size=block_size, - num_kv_heads=attn_module.num_kv_heads, - head_size=attn_module.head_size, - dtype=self.kv_cache_dtype, - use_mla=use_mla)) else: raise ValueError("Expected only encoder-only layers") if len(attn_specs) > 0: - assert len(attn_specs) == len(attn_layers), \ + total_layers = 0 + for attn_spec, layer_names in attn_specs.items(): + + attn_backends = get_attn_backends_for_layers(layer_names) + total_layers += len(layer_names) + + self.attn_groups.append( + create_attn_groups(attn_backends, attn_spec)) + assert total_layers == len(attn_layers), \ "All or none of the layers are expected to be encoder-only" - - attn_backends = get_attn_backends_for_layers(attn_layers.keys()) - - self.attn_groups.append( - create_attn_groups(attn_backends, attn_specs[0])) self.is_encoder_only_model = True def calculate_reorder_batch_threshold(self) -> None: @@ -3071,7 +3085,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _build_encoder_only_attn_metadata( self, scheduler_output: "SchedulerOutput") -> \ - tuple[CommonAttentionMetadata, Any]: + dict[str, tuple[CommonAttentionMetadata, Any]]: """Prepare encoder attention metadata for encoder-only models. Args: @@ -3088,10 +3102,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] max_num_scheduled_tokens = max(tokens) - # Use the first attention metadata builder - # to create encoder attention metadata - builder = self.attn_groups[0][0].metadata_builder - dummy_block_table = torch.zeros((num_reqs, 1), dtype=torch.int32, device=self.device) @@ -3099,22 +3109,38 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dtype=torch.int32, device=self.device) - common_metadata = CommonAttentionMetadata( - query_start_loc=self.query_start_loc[:num_reqs + 1], - query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], - seq_lens=self.seq_lens[:num_reqs], - seq_lens_cpu=self.seq_lens_cpu[:num_reqs], - num_computed_tokens_cpu=self.input_batch. 
- num_computed_tokens_cpu_tensor[:num_reqs], - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - block_table_tensor=dummy_block_table, - slot_mapping=dummy_slot_mapping, - causal=False, - ) + group_metadata = dict[str, tuple[CommonAttentionMetadata, Any]]() - return common_metadata, builder.build( - common_prefix_len=0, # No cascade for encoder - common_attn_metadata=common_metadata, - ) + for attn_group_list in self.attn_groups: + + assert len(attn_group_list) == 1 + attn_group = attn_group_list[0] + + # Use the first attention metadata builder + # to create encoder attention metadata + builder = attn_group.metadata_builder + + common_metadata = CommonAttentionMetadata( + query_start_loc=self.query_start_loc[:num_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], + seq_lens=self.seq_lens[:num_reqs], + seq_lens_cpu=self.seq_lens_cpu[:num_reqs], + num_computed_tokens_cpu=self.input_batch. + num_computed_tokens_cpu_tensor[:num_reqs], + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + block_table_tensor=dummy_block_table, + slot_mapping=dummy_slot_mapping, + causal=False, + ) + + metadata = builder.build( + common_prefix_len=0, # No cascade for encoder + common_attn_metadata=common_metadata, + ) + + for layer_name in attn_group.layer_names: + group_metadata[layer_name] = (common_metadata, metadata) + + return group_metadata From 78077d5417aee128ac4fe92220476ea721ac27e4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 08:23:49 +0100 Subject: [PATCH 187/932] Move `SchedulerConfig` from `config/__init__.py` to `config/scheduler.py` (#22626) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 316 +------------------------------------ vllm/config/scheduler.py | 329 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 314 deletions(-) create mode 100644 vllm/config/scheduler.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 03ab034c62..159106003f 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -34,6 +34,7 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, from vllm.config.compilation import (CompilationConfig, CompilationLevel, PassConfig) from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -47,15 +48,9 @@ from vllm.transformers_utils.config import ( try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -# yapf conflicts with isort for this block -# yapf: disable -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, LayerBlockType, +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType, LazyLoader, common_broadcastable_dtype, random_uuid) -# yapf: enable - if TYPE_CHECKING: from _typeshed import DataclassInstance from transformers.configuration_utils import PretrainedConfig @@ -1820,313 +1815,6 @@ class LoadConfig: self.ignore_patterns = ["original/**/*"] -PreemptionMode = Literal["swap", "recompute"] 
-SchedulerPolicy = Literal["fcfs", "priority"] - - -@config -@dataclass -class SchedulerConfig: - """Scheduler configuration.""" - - runner_type: RunnerType = "generate" - """The runner type to launch for the model.""" - - max_num_batched_tokens: SkipValidation[int] = None # type: ignore - """Maximum number of tokens to be processed in a single iteration. - - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" - - max_num_seqs: SkipValidation[int] = None # type: ignore - """Maximum number of sequences to be processed in a single iteration. - - This config has no static default. If left unspecified by the user, it will - be set in `EngineArgs.create_engine_config` based on the usage context.""" - - max_model_len: SkipValidation[int] = None # type: ignore - """Maximum length of a sequence (including prompt and generated text). This - is primarily set in `ModelConfig` and that value should be manually - duplicated here.""" - - max_num_partial_prefills: int = 1 - """For chunked prefill, the maximum number of sequences that can be - partially prefilled concurrently.""" - - max_long_partial_prefills: int = 1 - """For chunked prefill, the maximum number of prompts longer than - long_prefill_token_threshold that will be prefilled concurrently. Setting - this less than max_num_partial_prefills will allow shorter prompts to jump - the queue in front of longer prompts in some cases, improving latency.""" - - long_prefill_token_threshold: int = 0 - """For chunked prefill, a request is considered long if the prompt is - longer than this number of tokens.""" - - num_lookahead_slots: int = 0 - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - - cuda_graph_sizes: list[int] = field(default_factory=list) - """Cuda graph capture sizes - 1. if none provided, then default set to [min(max_num_seqs * 2, 512)] - 2. if one value is provided, then the capture list would follow the - pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] - 3. more than one value (e.g. 1 2 128) is provided, then the capture list - will follow the provided list.""" - - delay_factor: float = 0.0 - """Apply a delay (of delay factor multiplied by previous - prompt latency) before scheduling next prompt.""" - - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore - """If True, prefill requests can be chunked based - on the remaining max_num_batched_tokens.""" - - is_multimodal_model: bool = False - """True if the model is multimodal.""" - - # TODO (ywang96): Make this configurable. - max_num_encoder_input_tokens: int = field(init=False) - """Multimodal encoder compute budget, only used in V1. - - NOTE: This is not currently configurable. It will be overridden by - max_num_batched_tokens in case max multimodal embedding size is larger.""" - - # TODO (ywang96): Make this configurable. - encoder_cache_size: int = field(init=False) - """Multimodal encoder cache size, only used in V1. - - NOTE: This is not currently configurable. 
It will be overridden by - max_num_batched_tokens in case max multimodal embedding size is larger.""" - - preemption_mode: Optional[PreemptionMode] = None - """Whether to perform preemption by swapping or - recomputation. If not specified, we determine the mode as follows: - We use recomputation by default since it incurs lower overhead than - swapping. However, when the sequence group has multiple sequences - (e.g., beam search), recomputation is not currently supported. In - such a case, we use swapping instead.""" - - num_scheduler_steps: int = 1 - """Maximum number of forward steps per scheduler call.""" - - multi_step_stream_outputs: bool = True - """If False, then multi-step will stream outputs at the end of all steps""" - - send_delta_data: bool = False - """Private API. If used, scheduler sends delta data to - workers instead of an entire data. It should be enabled only - when SPMD worker architecture is enabled. I.e., - VLLM_USE_RAY_SPMD_WORKER=1""" - - policy: SchedulerPolicy = "fcfs" - """The scheduling policy to use:\n - - "fcfs" means first come first served, i.e. requests are handled in order - of arrival.\n - - "priority" means requests are handled based on given priority (lower - value means earlier handling) and time of arrival deciding any ties).""" - - chunked_prefill_enabled: bool = field(init=False) - """True if chunked prefill is enabled.""" - - disable_chunked_mm_input: bool = False - """If set to true and chunked prefill is enabled, we do not want to - partially schedule a multimodal item. Only used in V1 - This ensures that if a request has a mixed prompt - (like text tokens TTTT followed by image tokens IIIIIIIIII) where only - some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), - it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" - - # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) - # or "mod.custom_class". - scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" - """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the - default scheduler. Can be a class directly or the path to a class of form - "mod.custom_class".""" - - disable_hybrid_kv_cache_manager: bool = False - """If set to True, KV cache manager will allocate the same size of KV cache - for all attention layers even if there are multiple type of attention layers - like full attention and sliding window attention. - """ - - async_scheduling: bool = False - """EXPERIMENTAL: If set to True, perform async scheduling. This may help - reduce the CPU overheads, leading to better latency and throughput. However, - async scheduling is currently not supported with some features such as - structured outputs, speculative decoding, and pipeline parallelism. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. 
- factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self) -> None: - if self.max_model_len is None: - self.max_model_len = 8192 - - if self.max_num_seqs is None: - self.max_num_seqs = 128 - - if self.max_num_batched_tokens is None: - if self.enable_chunked_prefill: - if self.num_scheduler_steps > 1: - # Multi-step Chunked-Prefill doesn't allow prompt-chunking - # for now. Have max_num_batched_tokens set to max_model_len - # so we don't reject sequences on account of a short - # max_num_batched_tokens. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - self.max_num_batched_tokens = ( - DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - # If max_model_len is too short, use - # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value - # for higher throughput. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - - if self.runner_type == "pooling": - # Choose specific value for higher throughput - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - if self.is_multimodal_model: - # The value needs to be at least the number of multimodal tokens - self.max_num_batched_tokens = max( - self.max_num_batched_tokens, - MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, - ) - - # When using default settings, - # Ensure max_num_batched_tokens does not exceed model limit. - # Some models (e.g., Whisper) have embeddings tied to max length. - self.max_num_batched_tokens = min( - self.max_num_seqs * self.max_model_len, - self.max_num_batched_tokens) - - self.max_num_encoder_input_tokens = self.max_num_batched_tokens - self.encoder_cache_size = self.max_num_batched_tokens - - if self.enable_chunked_prefill: - logger.info( - "Chunked prefill is enabled with max_num_batched_tokens=%d.", - self.max_num_batched_tokens) - - self.chunked_prefill_enabled = self.enable_chunked_prefill - if self.max_num_partial_prefills > 1: - if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * - 0.04) - - logger.info( - "Concurrent partial prefills enabled with " - "max_num_partial_prefills=%d, max_long_partial_prefills=%d, " - "long_prefill_token_threshold=%d", - self.max_num_partial_prefills, self.max_long_partial_prefills, - self.long_prefill_token_threshold) - - # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)]. - # This avoids OOM in tight memory scenarios with small max_num_seqs, - # and prevents capture of many large graphs (>512) that would greatly - # increase startup time with limited performance benefit. - if not self.cuda_graph_sizes: - self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)] - - if self.async_scheduling: - self.scheduler_cls = ( - "vllm.v1.core.sched.async_scheduler.AsyncScheduler") - - @model_validator(mode='after') - def _verify_args(self) -> Self: - if (self.max_num_batched_tokens < self.max_model_len - and not self.chunked_prefill_enabled): - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " - f"smaller than max_model_len ({self.max_model_len}). " - "This effectively limits the maximum sequence length to " - "max_num_batched_tokens and makes vLLM reject longer " - "sequences. 
Please increase max_num_batched_tokens or " - "decrease max_model_len.") - - if self.max_num_batched_tokens < self.max_num_seqs: - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " - "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") - - if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: - logger.warning( - "max_num_batched_tokens (%d) exceeds max_num_seqs " - "* max_model_len (%d). This may lead to unexpected behavior.", - self.max_num_batched_tokens, - self.max_num_seqs * self.max_model_len) - - if self.num_lookahead_slots < 0: - raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0.") - - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - - if self.max_num_partial_prefills < 1: - raise ValueError( - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " - "must be greater than or equal to 1.") - elif self.max_num_partial_prefills > 1: - if not self.chunked_prefill_enabled: - raise ValueError("Chunked prefill must be enabled to set " - "max_num_partial_prefills > 1.") - - if self.long_prefill_token_threshold > self.max_model_len: - raise ValueError( - "long_prefill_token_threshold " - f"({self.long_prefill_token_threshold}) cannot be greater " - f"than the max_model_len ({self.max_model_len}).") - - if (self.max_long_partial_prefills - < 1) or (self.max_long_partial_prefills - > self.max_num_partial_prefills): - raise ValueError( - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " - "must be greater than or equal to 1 and less than or equal to " - f"max_num_partial_prefills ({self.max_num_partial_prefills}).") - - return self - - @property - def is_multi_step(self) -> bool: - return self.num_scheduler_steps > 1 - - Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py new file mode 100644 index 0000000000..db669600a0 --- /dev/null +++ b/vllm/config/scheduler.py @@ -0,0 +1,329 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal, Optional, Union + +from pydantic import SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS) + +if TYPE_CHECKING: + from vllm.config import RunnerType +else: + RunnerType = Any + +logger = init_logger(__name__) + +PreemptionMode = Literal["swap", "recompute"] +SchedulerPolicy = Literal["fcfs", "priority"] + + +@config +@dataclass +class SchedulerConfig: + """Scheduler configuration.""" + + runner_type: RunnerType = "generate" + """The runner type to launch for the model.""" + + max_num_batched_tokens: SkipValidation[int] = None # type: ignore + """Maximum number of tokens to be processed in a single iteration. + + This config has no static default. If left unspecified by the user, it will + be set in `EngineArgs.create_engine_config` based on the usage context.""" + + max_num_seqs: SkipValidation[int] = None # type: ignore + """Maximum number of sequences to be processed in a single iteration. 
+ + This config has no static default. If left unspecified by the user, it will + be set in `EngineArgs.create_engine_config` based on the usage context.""" + + max_model_len: SkipValidation[int] = None # type: ignore + """Maximum length of a sequence (including prompt and generated text). This + is primarily set in `ModelConfig` and that value should be manually + duplicated here.""" + + max_num_partial_prefills: int = 1 + """For chunked prefill, the maximum number of sequences that can be + partially prefilled concurrently.""" + + max_long_partial_prefills: int = 1 + """For chunked prefill, the maximum number of prompts longer than + long_prefill_token_threshold that will be prefilled concurrently. Setting + this less than max_num_partial_prefills will allow shorter prompts to jump + the queue in front of longer prompts in some cases, improving latency.""" + + long_prefill_token_threshold: int = 0 + """For chunked prefill, a request is considered long if the prompt is + longer than this number of tokens.""" + + num_lookahead_slots: int = 0 + """The number of slots to allocate per sequence per + step, beyond the known token ids. This is used in speculative + decoding to store KV activations of tokens which may or may not be + accepted. + + NOTE: This will be replaced by speculative config in the future; it is + present to enable correctness tests until then.""" + + cuda_graph_sizes: list[int] = field(default_factory=list) + """Cuda graph capture sizes + 1. if none provided, then default set to [min(max_num_seqs * 2, 512)] + 2. if one value is provided, then the capture list would follow the + pattern: [1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)] + 3. more than one value (e.g. 1 2 128) is provided, then the capture list + will follow the provided list.""" + + delay_factor: float = 0.0 + """Apply a delay (of delay factor multiplied by previous + prompt latency) before scheduling next prompt.""" + + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + """If True, prefill requests can be chunked based + on the remaining max_num_batched_tokens.""" + + is_multimodal_model: bool = False + """True if the model is multimodal.""" + + # TODO (ywang96): Make this configurable. + max_num_encoder_input_tokens: int = field(init=False) + """Multimodal encoder compute budget, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + # TODO (ywang96): Make this configurable. + encoder_cache_size: int = field(init=False) + """Multimodal encoder cache size, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + preemption_mode: Optional[PreemptionMode] = None + """Whether to perform preemption by swapping or + recomputation. If not specified, we determine the mode as follows: + We use recomputation by default since it incurs lower overhead than + swapping. However, when the sequence group has multiple sequences + (e.g., beam search), recomputation is not currently supported. In + such a case, we use swapping instead.""" + + num_scheduler_steps: int = 1 + """Maximum number of forward steps per scheduler call.""" + + multi_step_stream_outputs: bool = True + """If False, then multi-step will stream outputs at the end of all steps""" + + send_delta_data: bool = False + """Private API. If used, scheduler sends delta data to + workers instead of an entire data. 
It should be enabled only + when SPMD worker architecture is enabled. I.e., + VLLM_USE_RAY_SPMD_WORKER=1""" + + policy: SchedulerPolicy = "fcfs" + """The scheduling policy to use:\n + - "fcfs" means first come first served, i.e. requests are handled in order + of arrival.\n + - "priority" means requests are handled based on given priority (lower + value means earlier handling) and time of arrival deciding any ties).""" + + chunked_prefill_enabled: bool = field(init=False) + """True if chunked prefill is enabled.""" + + disable_chunked_mm_input: bool = False + """If set to true and chunked prefill is enabled, we do not want to + partially schedule a multimodal item. Only used in V1 + This ensures that if a request has a mixed prompt + (like text tokens TTTT followed by image tokens IIIIIIIIII) where only + some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), + it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" + + # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) + # or "mod.custom_class". + scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" + """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the + default scheduler. Can be a class directly or the path to a class of form + "mod.custom_class".""" + + disable_hybrid_kv_cache_manager: bool = False + """If set to True, KV cache manager will allocate the same size of KV cache + for all attention layers even if there are multiple type of attention layers + like full attention and sliding window attention. + """ + + async_scheduling: bool = False + """EXPERIMENTAL: If set to True, perform async scheduling. This may help + reduce the CPU overheads, leading to better latency and throughput. However, + async scheduling is currently not supported with some features such as + structured outputs, speculative decoding, and pipeline parallelism. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.max_model_len is None: + self.max_model_len = 8192 + + if self.max_num_seqs is None: + self.max_num_seqs = 128 + + if self.max_num_batched_tokens is None: + if self.enable_chunked_prefill: + if self.num_scheduler_steps > 1: + # Multi-step Chunked-Prefill doesn't allow prompt-chunking + # for now. Have max_num_batched_tokens set to max_model_len + # so we don't reject sequences on account of a short + # max_num_batched_tokens. + self.max_num_batched_tokens = max( + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + else: + self.max_num_batched_tokens = ( + DEFAULT_MAX_NUM_BATCHED_TOKENS) + else: + # If max_model_len is too short, use + # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value + # for higher throughput. 
+ self.max_num_batched_tokens = max( + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + + if self.runner_type == "pooling": + # Choose specific value for higher throughput + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + if self.is_multimodal_model: + # The value needs to be at least the number of multimodal tokens + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + ) + + # When using default settings, + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. + self.max_num_batched_tokens = min( + self.max_num_seqs * self.max_model_len, + self.max_num_batched_tokens) + + self.max_num_encoder_input_tokens = self.max_num_batched_tokens + self.encoder_cache_size = self.max_num_batched_tokens + + if self.enable_chunked_prefill: + logger.info( + "Chunked prefill is enabled with max_num_batched_tokens=%d.", + self.max_num_batched_tokens) + + self.chunked_prefill_enabled = self.enable_chunked_prefill + if self.max_num_partial_prefills > 1: + if self.long_prefill_token_threshold == 0: + self.long_prefill_token_threshold = int(self.max_model_len * + 0.04) + + logger.info( + "Concurrent partial prefills enabled with " + "max_num_partial_prefills=%d, max_long_partial_prefills=%d, " + "long_prefill_token_threshold=%d", + self.max_num_partial_prefills, self.max_long_partial_prefills, + self.long_prefill_token_threshold) + + # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)]. + # This avoids OOM in tight memory scenarios with small max_num_seqs, + # and prevents capture of many large graphs (>512) that would greatly + # increase startup time with limited performance benefit. + if not self.cuda_graph_sizes: + self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)] + + if self.async_scheduling: + self.scheduler_cls = ( + "vllm.v1.core.sched.async_scheduler.AsyncScheduler") + + @model_validator(mode='after') + def _verify_args(self) -> Self: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({self.max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. Please increase max_num_batched_tokens or " + "decrease max_model_len.") + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: + logger.warning( + "max_num_batched_tokens (%d) exceeds max_num_seqs " + "* max_model_len (%d). 
This may lead to unexpected behavior.", + self.max_num_batched_tokens, + self.max_num_seqs * self.max_model_len) + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + if self.max_num_partial_prefills < 1: + raise ValueError( + f"max_num_partial_prefills ({self.max_num_partial_prefills}) " + "must be greater than or equal to 1.") + elif self.max_num_partial_prefills > 1: + if not self.chunked_prefill_enabled: + raise ValueError("Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1.") + + if self.long_prefill_token_threshold > self.max_model_len: + raise ValueError( + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({self.max_model_len}).") + + if (self.max_long_partial_prefills + < 1) or (self.max_long_partial_prefills + > self.max_num_partial_prefills): + raise ValueError( + f"max_long_partial_prefills ({self.max_long_partial_prefills}) " + "must be greater than or equal to 1 and less than or equal to " + f"max_num_partial_prefills ({self.max_num_partial_prefills}).") + + return self + + @property + def is_multi_step(self) -> bool: + return self.num_scheduler_steps > 1 From 59f3b936365afd200e474ddc9d1f5aa33f05b634 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 12 Aug 2025 03:22:49 -0500 Subject: [PATCH 188/932] [DOC] update v1_guide with INTEL HW (#22679) Signed-off-by: Chendi.Xue --- docs/usage/v1_guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 12191d3490..54af970ea8 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -63,6 +63,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the |------------|-----------------------------------------------| | **NVIDIA** | 🚀 | | **AMD** | 🟢 | +| **INTEL GPU** | 🟢 | | **TPU** | 🟢 | | **CPU** | 🟢 (x86\_64/aarch64) 🟡 (MacOS) | @@ -72,6 +73,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the - [vllm-ascend](https://github.com/vllm-project/vllm-ascend) - [vllm-spyre](https://github.com/vllm-project/vllm-spyre) + - [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) - [vllm-openvino](https://github.com/vllm-project/vllm-openvino) Please check their corresponding repositories for more details. From 9f909b89963aa71b06b490a78ac9905d11879454 Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Tue, 12 Aug 2025 04:39:54 -0400 Subject: [PATCH 189/932] [New Model] Support Command-A-Vision (#22660) Signed-off-by: donglu --- docs/models/supported_models.md | 3 +- examples/offline_inference/vision_language.py | 24 + .../vision_language_multi_image.py | 37 ++ tests/models/registry.py | 1 + vllm/model_executor/models/cohere2_vision.py | 445 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 6 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/cohere2_vision.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ddab7ad5d9..ea36331542 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -331,7 +331,7 @@ th { | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. 
| | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ | @@ -601,6 +601,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | | `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | +| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. 
| | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 5b3f0d2dc2..988ad35cdd 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ) +def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "CohereLabs/command-a-vision-07-2025" + + engine_args = EngineArgs( + model=model_name, + max_model_len=32768, + tensor_parallel_size=4, + limit_mm_per_prompt={modality: 1}, + ) + + prompts = [ + f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Deepseek-VL2 def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1417,6 +1440,7 @@ model_example_map = { "aya_vision": run_aya_vision, "blip-2": run_blip2, "chameleon": run_chameleon, + "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, "florence2": run_florence2, "fuyu": run_fuyu, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 1ab405fa14..799337ed68 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "CohereLabs/command-a-vision-07-2025" + + # NOTE: This model is 122B parameters and requires tensor parallelism + # Recommended to use tp=4 on H100 GPUs + engine_args = EngineArgs( + model=model_name, + max_model_len=32768, + tensor_parallel_size=4, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" @@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, "aya_vision": load_aya_vision, + "command_a_vision": load_command_a_vision, "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, diff --git a/tests/models/registry.py b/tests/models/registry.py index c5816df25b..eae5829030 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -383,6 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501 extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 + "Cohere2VisionForConditionalGeneration": 
_HfExamplesInfo("CohereLabs/command-a-vision-07-2025"), # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py new file mode 100644 index 0000000000..f17583768f --- /dev/null +++ b/vllm/model_executor/models/cohere2_vision.py @@ -0,0 +1,445 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from vllm/model_executor/models/aya_vision.py +"""Command-A-Vision (Cohere2Vision) multimodal model implementation for vLLM.""" + +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Literal, Optional, Union + +import torch +from torch import nn +from transformers import BatchFeature, PretrainedConfig +from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.processing_cohere2_vision import ( + Cohere2VisionProcessor) + +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import MulAndSilu +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, + MultiModalFieldConfig, + PromptReplacement, PromptUpdate, + PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .siglip import SiglipVisionModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + + +class Cohere2VisionImagePixelInputs(TensorSchema): + """ + Dimensions: + - np: The total number of patches over each image over each prompt in + the batch + - c: Number of channels + - h: Height of each image patch + - w: Width of each image patch + - bn: Batch size * number of images + """ + + type: Literal["pixel_values"] + + pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", 3, "h", "w"), + ] + + num_patches: Annotated[ + torch.Tensor, + TensorShape("bn"), + ] + + +class Cohere2VisionMultiModalProjector(nn.Module): + """Multimodal projector that maps vision features to text embedding space. + + Uses pixel shuffle downsampling followed by SwiGLU activation. + """ + + def __init__(self, config: Cohere2VisionConfig, prefix: str = ""): + super().__init__() + self.downsample_factor = config.downsample_factor + + # Input dimension after pixel shuffle downsampling + input_dim = config.vision_config.hidden_size * ( + config.downsample_factor**2) + # MergedColumnParallelLinear expects the intermediate size to be a list + # of sizes, so that it will load the weights as two separate linear + # layers before applying any parallelism. 
+ # We need to divide the alignment intermediate size by 2 because + # the weights are merged weights of two linear layers for SwiGLU. + self.intermediate_size = config.alignment_intermediate_size // 2 + + self.linear_1 = MergedColumnParallelLinear( + input_dim, + [self.intermediate_size] * 2, + bias=True, + return_bias=False, + prefix=f"{prefix}.linear_1", + ) + self.act = MulAndSilu() + self.linear_2 = RowParallelLinear( + self.intermediate_size, + config.text_config.hidden_size, + bias=True, + return_bias=False, + prefix=f"{prefix}.linear_2", + ) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features: torch.Tensor) -> torch.Tensor: + """Apply pixel shuffle downsampling to reduce spatial dimensions. + + Args: + image_features: Input tensor of shape [B, S, D] where S = H*W + + Returns: + Downsampled tensor with increased channel dimension + """ + height = width = int(image_features.shape[1]**0.5) + x = image_features.reshape(image_features.shape[0], width, height, -1) + n, h, w, c = x.size() + scale_factor = 1. / self.downsample_factor + nh = int(h * scale_factor) + nw = int(w * scale_factor) + x = x.reshape(n, nh, self.downsample_factor, nw, + self.downsample_factor, c) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous() + x = x.reshape(n, nh, nw, -1) + return x + + +class Cohere2VisionProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> Cohere2VisionConfig: + return self.ctx.get_hf_config(Cohere2VisionConfig) + + def get_hf_processor(self, **kwargs: object) -> Cohere2VisionProcessor: + return self.ctx.get_hf_processor(Cohere2VisionProcessor, **kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + height = image_processor.size['height'] + width = image_processor.size['width'] + max_patches = image_processor.max_patches + return ImageSize(height=height * max_patches, width=width) + + def get_num_patches(self, image_width: int, image_height: int) -> int: + """ + Calculate the number of image patches for a given image. + Uses the HF processor to determine the actual number of patches. 
+ """ + return self.get_hf_processor( + ).image_processor.get_number_of_image_patches(image_height, + image_width, {}) + + +class Cohere2VisionDummyInputsBuilder( + BaseDummyInputsBuilder[Cohere2VisionProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + image_size = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=image_size.width, + height=image_size.height, + num_images=num_images) + } + + +class Cohere2VisionMultiModalProcessor( + BaseMultiModalProcessor[Cohere2VisionProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt, + mm_data, + mm_kwargs, + tok_kwargs, + ) + + # Ensure num_patches is available for proper tensor splitting + if "num_patches" not in processed_outputs and ( + images := mm_data.get("images")) is not None: + # Fallback calculation if HF processor didn't provide num_patches + parsed_images = self._get_data_parser().parse_mm_data({ + "image": + images + }).get_items("image", ImageProcessorItems) + + num_patches = [ + self.info.get_num_patches( + image_width=parsed_images.get_image_size(i).width, + image_height=parsed_images.get_image_size(i).height) + for i in range(len(parsed_images)) + ] + processed_outputs["num_patches"] = torch.tensor(num_patches) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + num_patches = hf_inputs.get("num_patches", torch.empty(0)) + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), + num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_token = hf_processor.image_token + img_line_break_token = hf_processor.img_line_break_token + boi_token = hf_processor.boi_token + eoi_token = hf_processor.eoi_token + + def get_replacement(item_idx: int): + images: ImageProcessorItems = mm_items.get("image", + ImageProcessorItems) + image_size: ImageSize = images.get_image_size(item_idx) + + num_patches = self.info.get_num_patches(image_size.height, + image_size.width) + img_tokens_per_tile = int(hf_processor.patch_size**2) + single_tile_tokens = image_token * img_tokens_per_tile + \ + img_line_break_token + img_string = f"{boi_token}\ + {single_tile_tokens * num_patches}\ + {eoi_token}" + + return PromptUpdateDetails.select_text(img_string, image_token) + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + Cohere2VisionMultiModalProcessor, + info=Cohere2VisionProcessingInfo, + dummy_inputs=Cohere2VisionDummyInputsBuilder) +class 
Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "model.language_model.": "language_model.model.", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: Cohere2VisionConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config + self.quant_config = quant_config + self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) + + self.vision_tower = SiglipVisionModel(config.vision_config, + quant_config, + prefix=maybe_prefix( + prefix, "vision_tower")) + self.vocab_size = config.text_config.vocab_size + self.multi_modal_projector = \ + Cohere2VisionMultiModalProjector( + config, prefix=maybe_prefix(prefix, "multi_modal_projector")) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Cohere2ForCausalLM"]) + + @property + def dtype(self): + return next(self.parameters()).dtype + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def _process_image_input(self, image_input: Cohere2VisionImagePixelInputs, + **kwargs) -> list[torch.Tensor]: + """Process image pixels through vision tower and projector. + + Args: + image_input: Validated image input containing pixel values and + patch counts + + Returns: + List of flattened image embeddings, one per image + """ + assert self.vision_tower is not None, "Vision tower is required" + + pixel_values = image_input["pixel_values"] + num_patches = image_input["num_patches"] + + # Extract visual features + image_features = self.vision_tower(pixel_values) + + # Project to text embedding space + image_embeds = self.multi_modal_projector(image_features) + + # Split and flatten embeddings per image + return [ + e.flatten(0, 2) for e in image_embeds.split(num_patches.tolist()) + ] + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Cohere2VisionImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + num_patches = kwargs.pop("num_patches", None) + image_embeds = kwargs.pop("image_embeds", None) + assert image_embeds is None, \ + "Cohere2Vision does not support image_embeds." 
+ + if pixel_values is None: + return None + + return Cohere2VisionImagePixelInputs( + type="pixel_values", + pixel_values=flatten_bn(pixel_values, concat=True), + num_patches=flatten_bn(num_patches, concat=True), + resolve_bindings={ + "h": self.config.vision_config.image_size, + "w": self.config.vision_config.image_size, + }) + + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing `modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, "quantization_config", + None) + if (not quant_config.modules_to_not_convert) and (llm_quant_config + is not None): + quant_config.modules_to_not_convert.append("vision_tower") + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return [] + + return self._process_image_input(image_input, **kwargs) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None \ + and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_id, + ) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 870704c64d..279e045a70 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -201,6 +201,7 @@ _MULTIMODAL_MODELS = { "AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501 "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 + "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 From 8d17fa633e319c4a585f6ae1258000a40750e127 Mon Sep 17 00:00:00 2001 From: Sugar-zsg <64777228+Sugar-zsg@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:01:08 +0800 Subject: [PATCH 190/932] [V0] Correct CUDA Graph capture for encoder-decoder models (#22630) --- vllm/config/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 159106003f..df4eb33f5d 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1164,8 +1164,18 @@ class ModelConfig: "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: + # The `max_seq_len_to_capture` was incorrectly + # based on the encoder's input length (448) + # but not the decoder's larger input length (1500). + # This change ensures the CUDA Graph captures the correct, + # larger sequence length, allowing it to work as intended. 
+ effective_max_seq_len = self.max_model_len + if self.is_encoder_decoder: + effective_max_seq_len = max( + effective_max_seq_len, + getattr(self.hf_config, "max_source_positions", 0)) self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, - self.max_model_len) + effective_max_seq_len) # CUDAGraph capture not supported for enc-dec models and mllama on ROCm ROCM_UNSUPPORTED_MODELS = ['mllama'] unsupported_rocm = (self.hf_config.model_type From bc8372efc318d404db4b40a6ef86c3452f5f2a46 Mon Sep 17 00:00:00 2001 From: phantomlei Date: Tue, 12 Aug 2025 17:03:22 +0800 Subject: [PATCH 191/932] [Bugfix] Fix erroneous randomly generated cases in bad word testing (#22170) Signed-off-by: phantomlei --- tests/v1/sample/test_sampler.py | 34 +++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index ea10661ea1..31c6c881d7 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -90,6 +90,27 @@ def _create_bad_words_token_ids( return bad_words_token_ids +# Returns all last tokens of bad word sequences that share the same prefix +# as `given_prefix` (excluding the last token). +def _collect_suffixes_with_same_prefix( + given_prefix: list[int], + bad_words_token_ids: list[list[int]]) -> list[int]: + return [bwt[-1] for bwt in bad_words_token_ids if bwt[:-1] == given_prefix] + + +# generate a valid token id that is not in bad_words_token_ids +def _generate_valid_token_id(bad_words_token_ids: list[list[int]], + vocab_size: int) -> int: + forbidden_start_tokens = set() + for bad_word in bad_words_token_ids: + forbidden_start_tokens.add(bad_word[0]) + # Get a safe token that's not in forbidden starts + safe_token_candidates = list( + set(range(vocab_size)) - forbidden_start_tokens) + # Pick a random safe token + return np.random.choice(safe_token_candidates) + + def _update_output_token_ids_for_bad_words( metadata: SamplingMetadata, vocab_size: int) -> dict[int, list[int]]: bad_words_last_tokens = {} @@ -104,12 +125,17 @@ def _update_output_token_ids_for_bad_words( prefix_length = len(bad_word_token_ids) - 1 has_bad_words = np.random.choice([True, False]) if has_bad_words: - output_token_ids[-prefix_length:] = bad_word_token_ids[:-1] - bad_words_last_token.append(bad_word_token_ids[-1]) + prefix = bad_word_token_ids[:-1] + output_token_ids[-prefix_length:] = prefix + # Collect all last tokens from other bad words + # that share this prefix + bad_words_last_token.extend( + _collect_suffixes_with_same_prefix( + prefix, bad_words_token_ids)) break # Maximum one update to output_token_ids else: # Make sure no accidental match to bad words - output_token_ids[-1] = (bad_word_token_ids[-2] + - 1) % vocab_size + output_token_ids[-1] = _generate_valid_token_id( + bad_words_token_ids, vocab_size) bad_words_last_tokens[batch_idx] = bad_words_last_token return bad_words_last_tokens From 1ece7f30baa9d94ff57e13d851725acf657a9690 Mon Sep 17 00:00:00 2001 From: Jun-Howie <62869005+Jun-Howie@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:03:53 +0800 Subject: [PATCH 192/932] Fix: AWQ Marlin get_quant_method does not recognize "modules_to_not_convert" (#21888) Signed-off-by: JunHowie Co-authored-by: JunHowie Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/awq_marlin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 0fdded0b5a..6cf02658a9 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -10,7 +10,8 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) + FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) @@ -141,6 +142,9 @@ class AWQMarlinConfig(QuantizationConfig): elif isinstance(layer, FusedMoE): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) + if is_layer_skipped_awq( + prefix, getattr(self, "modules_to_not_convert", [])): + return UnquantizedFusedMoEMethod(layer.moe_config) if not check_moe_marlin_supports_layer(layer, self.group_size): logger.warning_once( f"Layer '{prefix}' is not supported by AWQMoeMarlin. " @@ -520,4 +524,4 @@ class AWQMoEMethod(FusedMoEMethodBase): expert_map=expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, - workspace=layer.workspace) + workspace=layer.workspace) \ No newline at end of file From 46ae7f666699496f45c0349b87f08d5119720951 Mon Sep 17 00:00:00 2001 From: RishiAstra <40644327+RishiAstra@users.noreply.github.com> Date: Tue, 12 Aug 2025 05:04:37 -0400 Subject: [PATCH 193/932] [Bugfix] Mamba2 SSD varlen bug fix initstates decay, improve test, assert chunk pwr 2 (#21783) Signed-off-by: Rishi Astra <40644327+RishiAstra@users.noreply.github.com> --- tests/kernels/mamba/test_mamba_ssm_ssd.py | 17 ++++++++--------- .../layers/mamba/ops/ssd_chunk_scan.py | 6 ++---- .../layers/mamba/ops/ssd_combined.py | 5 +++++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 67b14a7faa..d2b893ffff 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -187,7 +187,7 @@ def generate_continuous_batched_examples(example_lens_by_batch, [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) @pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) -@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)]) +@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)]) def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): @@ -253,15 +253,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, (8, 8, 16, 32, 16), ]), # mode examples with varied lengths - # odd chunk_size - (64, 29, 2, [(11, 4), (13, 23), (19, 22), - (21, 15)]), # irregular sizes - # large-ish chunk_size (256) (64, 256, 1, [(5, ), (1, ), (1, ), (1, )]), # irregular sizes with small sequences (64, 256, 2, [(5, 30), (1, 2), (1, 2), (1, 2)]), # irregular sizes with small sequences + + # we also need to test some large seqlen + # to catch errors with init states decay + (768, 128, 2, [(138, 225), (138, 225)]), ]) def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, itype): @@ -271,10 +271,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases - # 
TODO: the irregular chunk size cases have some issues and require higher - # tolerance. This is to be invesigated - if chunk_size not in {8, 256}: - atol, rtol = 5e-1, 5e-1 + # This test can have larger error for longer sequences + if seqlen > 256: + atol, rtol = 1e-2, 5e-3 else: atol, rtol = 5e-3, 5e-3 diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index fc2b3b25fd..365139e237 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -290,10 +290,8 @@ def _chunk_scan_fwd_kernel( # get the cs at the offset boundary # - c_off == 0 is a passthrough dA_cs_m_boundary = tl.load( - dA_cumsum_ptr + - (pid_m * BLOCK_SIZE_M + c_off - 1) * stride_dA_cs_csize, - mask=(((pid_m * BLOCK_SIZE_M + c_off - 1) > -1) - and ((pid_m * BLOCK_SIZE_M + c_off) < chunk_size)), + dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize, + mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)), other=0.0).to(tl.float32) if HAS_SEQ_IDX: diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index ad2853a3d8..fd74cb8372 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -21,6 +21,10 @@ from .ssd_state_passing import _state_passing_fwd TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0') +def is_int_pow_2(n): + return isinstance(n, int) and n > 0 and (n & (n - 1)) == 0 + + def _mamba_chunk_scan_combined_fwd(x, dt, A, @@ -38,6 +42,7 @@ def _mamba_chunk_scan_combined_fwd(x, dt_softplus=False, dt_limit=(0.0, float("inf")), out=None): + assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2" batch, seqlen, nheads, headdim = x.shape _, _, ngroups, dstate = B.shape assert nheads % ngroups == 0 From 50f2aae1b4afb8799bc6a38254639e031997e61c Mon Sep 17 00:00:00 2001 From: zejunchen-zejun Date: Tue, 12 Aug 2025 17:05:14 +0800 Subject: [PATCH 194/932] [LMCache][Example] Align the PYTHONHASHSEED for prefillers and decoders for KV chunks hashing (#21161) Signed-off-by: zejunchen-zejun --- .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh index 1284466a45..682df45d95 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -15,6 +15,14 @@ else MODEL=$2 fi +# The prefillers and decoders in LMCache use the same hash seed for all chunk keys. +# This seed must be aligned so that decoders can identify and retrieve KV cache +# entries stored by prefillers. +# +# WARNING: Using a fixed hash seed is insecure and makes the application vulnerable to +# denial-of-service attacks. In a production environment, this should be set to a +# secure random value. This is set to a fixed value for demonstration purposes only. 
+export PYTHONHASHSEED=${VLLM_PYTHON_HASH_SEED:-123} if [[ $1 == "prefiller" ]]; then # Prefiller listens on port 8100 From b8a9d0e4298710c5b3533b411395593dcaaa61c2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 12 Aug 2025 18:15:33 +0800 Subject: [PATCH 195/932] [Misc] remove GH discussions link (#22722) Signed-off-by: Jee Jee Li --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d9e3ca660f..fd8b02ac1f 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature From 007dd90859cc0337510536677418a43d8f66e286 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Tue, 12 Aug 2025 06:21:44 -0400 Subject: [PATCH 196/932] [gpt-oss] Enable gpt-oss on ampere (#22714) Signed-off-by: Yongye Zhu --- .../vllm_add_dummy_platform/dummy_platform.py | 5 +++-- vllm/attention/layer.py | 4 +++- vllm/attention/selector.py | 5 ++++- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- vllm/platforms/cpu.py | 4 ++-- vllm/platforms/cuda.py | 7 +++++-- vllm/platforms/interface.py | 4 ++-- vllm/platforms/rocm.py | 4 ++-- vllm/platforms/tpu.py | 4 ++-- vllm/platforms/xpu.py | 4 ++-- 10 files changed, 26 insertions(+), 17 deletions(-) diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index e67825f89d..8d0687b49b 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -25,5 +25,6 @@ class DummyPlatform(Platform): compilation_config.custom_ops = ["all"] def get_attn_backend_cls(self, backend_name, head_size, dtype, - kv_cache_dtype, block_size, use_v1, use_mla): - return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 \ No newline at end of file + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink): + return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b4c3cbd7c9..1a9c0e26b5 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -138,6 +138,7 @@ class Attention(nn.Module): self.head_size = head_size self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window + self.has_sink = extra_impl_args.get("sinks") is not None quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None @@ -165,7 +166,8 @@ class Attention(nn.Module): kv_cache_dtype, block_size, is_attention_free, - use_mla=use_mla) + use_mla=use_mla, + has_sink=self.has_sink) else: self.attn_backend = attn_backend diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 508470bb36..3a235ba6e0 100644 --- a/vllm/attention/selector.py 
+++ b/vllm/attention/selector.py @@ -144,6 +144,7 @@ def get_attn_backend( block_size: int, is_attention_free: bool = False, use_mla: bool = False, + has_sink: bool = False, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" # Accessing envs.* behind an @lru_cache decorator can cause the wrong @@ -158,6 +159,7 @@ def get_attn_backend( is_attention_free=is_attention_free, use_v1=envs.VLLM_USE_V1, use_mla=use_mla, + has_sink=has_sink, ) @@ -170,6 +172,7 @@ def _cached_get_attn_backend( is_attention_free: bool, use_v1: bool = False, use_mla: bool = False, + has_sink: bool = False, ) -> type[AttentionBackend]: # If there are no attention layers (e.g. we are running Mamba), # use the placeholder NO_ATTENTION @@ -201,7 +204,7 @@ def _cached_get_attn_backend( # get device-specific attn_backend attention_cls = current_platform.get_attn_backend_cls( selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, - use_mla) + use_mla, has_sink) if not attention_cls: raise ValueError( f"Invalid attention backend for {current_platform.device_name}") diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 4e59aef480..03fbcf1583 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -42,7 +42,7 @@ class Mxfp4Config(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 90 + return 80 @classmethod def get_name(cls) -> QuantizationMethods: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 31a67183ff..0b16a8e1d1 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -91,8 +91,8 @@ class CpuPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: if selected_backend and selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index dd9356e399..c876c52a2e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -222,8 +222,8 @@ class CudaPlatformBase(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, - kv_cache_dtype, block_size, use_v1, - use_mla) -> str: + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink) -> str: if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here @@ -321,6 +321,9 @@ class CudaPlatformBase(Platform): # FlashAttention is the default for SM 8.0+ GPUs if cls.has_device_capability(80): + if has_sink: + logger.info_once("Using Triton backend on V1 engine.") + return TRITON_ATTN_VLLM_V1 if is_default_backend_supported := is_attn_backend_supported( FLASH_ATTN_V1, head_size, dtype, allow_import_error=False): diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index a85b583abc..91d5314900 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -196,8 +196,8 @@ class Platform: @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: """Get the attention backend class 
of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d26e4b3350..8005830f55 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -188,8 +188,8 @@ class RocmPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, - kv_cache_dtype, block_size, use_v1, - use_mla) -> str: + kv_cache_dtype, block_size, use_v1, use_mla, + has_sink) -> str: if use_mla: from vllm.attention.backends.rocm_aiter_mla import ( is_aiter_mla_enabled) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 146801c9d7..c56096d936 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -46,8 +46,8 @@ class TpuPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink) -> str: if (selected_backend != _Backend.PALLAS and selected_backend != _Backend.PALLAS_VLLM_V1): logger.info("Cannot use %s backend on TPU.", selected_backend) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d8a663f2f0..abd58dbbcb 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -35,8 +35,8 @@ class XPUPlatform(Platform): @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: + block_size: int, use_v1: bool, use_mla: bool, + has_sink: bool) -> str: if selected_backend is not None and selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) use_v1 = envs.VLLM_USE_V1 From 767e63b860dcb8952779f6035d2b215b53dd744d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:25:55 +0100 Subject: [PATCH 197/932] [Docs] Improve docs navigation (#22720) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .gitignore | 3 ++- docs/.nav.yml | 22 +++++++--------------- docs/README.md | 11 +++++++++++ docs/examples/README.md | 7 +++++++ docs/mkdocs/stylesheets/extra.css | 7 +++++++ docs/usage/README.md | 4 +++- mkdocs.yaml | 5 +++-- 7 files changed, 40 insertions(+), 19 deletions(-) create mode 100644 docs/examples/README.md diff --git a/.gitignore b/.gitignore index 5dc0f04b6f..721dd7536b 100644 --- a/.gitignore +++ b/.gitignore @@ -150,7 +150,8 @@ venv.bak/ # mkdocs documentation /site docs/argparse -docs/examples +docs/examples/* +!docs/examples/README.md # mypy .mypy_cache/ diff --git a/docs/.nav.yml b/docs/.nav.yml index acedc32c30..dbac0e12f1 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -1,25 +1,17 @@ nav: - - Home: - - vLLM: README.md + - Home: README.md + - User Guide: + - usage/README.md - Getting Started: - getting_started/quickstart.md - getting_started/installation - Examples: + - examples/README.md - Offline Inference: examples/offline_inference - Online Serving: examples/online_serving - Others: examples/others - - Quick Links: - - User Guide: usage/README.md - - Developer Guide: contributing/README.md - - API Reference: api/README.md - - CLI Reference: cli/README.md - - Timeline: - - Roadmap: https://roadmap.vllm.ai - - Releases: https://github.com/vllm-project/vllm/releases - - User Guide: - - Summary: usage/README.md - - usage/v1_guide.md - General: + - usage/v1_guide.md - usage/* - Inference and Serving: - serving/offline_inference.md 
@@ -32,7 +24,7 @@ nav: - deployment/integrations - Training: training - Configuration: - - Summary: configuration/README.md + - configuration/README.md - configuration/* - Models: - models/supported_models.md @@ -45,7 +37,7 @@ nav: - features/* - features/quantization - Developer Guide: - - Summary: contributing/README.md + - contributing/README.md - General: - glob: contributing/* flatten_single_child_sections: true diff --git a/docs/README.md b/docs/README.md index 6823008ed3..e8d2fd953a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,6 +21,17 @@ vLLM is a fast and easy-to-use library for LLM inference and serving. Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. +Where to get started with vLLM depends on the type of user. If you are looking to: + +- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) +- Build applications with vLLM, we recommend starting with the [User Guide](./usage) +- Build vLLM, we recommend starting with [Developer Guide](./contributing) + +For information about the development of vLLM, see: + +- [Roadmap](https://roadmap.vllm.ai) +- [Releases](https://github.com/vllm-project/vllm/releases) + vLLM is fast with: - State-of-the-art serving throughput diff --git a/docs/examples/README.md b/docs/examples/README.md new file mode 100644 index 0000000000..34e4dfd408 --- /dev/null +++ b/docs/examples/README.md @@ -0,0 +1,7 @@ +# Examples + +vLLM's examples are split into three categories: + +- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/) +- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/) +- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/) diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index fb44d9cdcf..6a1979b241 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -23,6 +23,13 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) { } } +a[href*="localhost"]::after, +a[href*="127.0.0.1"]::after, +a[href*="org.readthedocs.build"]::after, +a[href*="docs.vllm.ai"]::after { + display: none !important; +} + /* Light mode: darker section titles */ body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis { color: rgba(0, 0, 0, 0.7) !important; diff --git a/docs/usage/README.md b/docs/usage/README.md index 681db57d8e..83aea12181 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -1,6 +1,8 @@ # Using vLLM -vLLM supports the following usage patterns: +First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment. + +Then, vLLM supports the following usage patterns: - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. - [Deployment](../deployment/docker.md): Scale up model instances for production. 
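The examples split documented above distinguishes offline inference (the Python API) from online serving (HTTP clients). As a rough illustration of the online-serving path only, the sketch below queries a locally running `vllm serve` instance through its OpenAI-compatible API; the host, port, API key, and model name are illustrative assumptions, not values taken from these patches.

```python
# Hedged sketch: talking to a local `vllm serve` endpoint with the OpenAI
# client. Assumes something like `vllm serve <model> --port 8000` is already
# running; every connection detail below is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen3-8B",  # placeholder model name
    messages=[{"role": "user", "content": "Summarize what vLLM does."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```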
diff --git a/mkdocs.yaml b/mkdocs.yaml index 3a64888fb4..47fe1ebce9 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -34,13 +34,14 @@ theme: - content.action.edit - content.code.copy - content.tabs.link + - navigation.instant + - navigation.instant.progress - navigation.tracking - navigation.tabs - navigation.tabs.sticky - navigation.sections - - navigation.prune - - navigation.top - navigation.indexes + - navigation.top - search.highlight - search.share - toc.follow From d030b01548d52a5e3afe56fdb8ce7a367b9799e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 14:37:30 +0200 Subject: [PATCH 198/932] [BugFix][Nixl][PD] Fix heterogenous TP (#22663) Signed-off-by: NickLucche Co-authored-by: Nick Hill --- .../kv_transfer/kv_connector/factory.py | 37 ++++++++++++------- .../kv_transfer/kv_connector/utils.py | 11 ++++-- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 01673a0d7c..584fc1d655 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -4,13 +4,17 @@ import importlib from typing import TYPE_CHECKING, Callable +# yapf: disable import vllm.envs as envs -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.base import ( + KVConnectorBase, KVConnectorBaseType) from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger +# yapf: enable + if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import KVTransferConfig, VllmConfig logger = init_logger(__name__) @@ -42,17 +46,7 @@ class KVConnectorFactory: f"but found {envs.VLLM_USE_V1=}") kv_transfer_config = config.kv_transfer_config - connector_name = kv_transfer_config.kv_connector - if connector_name in cls._registry: - connector_cls = cls._registry[connector_name]() - else: - connector_module_path = kv_transfer_config.kv_connector_module_path - if connector_module_path is None: - raise ValueError( - f"Unsupported connector type: {connector_name}") - connector_module = importlib.import_module(connector_module_path) - connector_cls = getattr(connector_module, connector_name) - assert issubclass(connector_cls, KVConnectorBase) + connector_cls = cls.get_connector_class(kv_transfer_config) logger.info("Creating v1 connector with name: %s and engine_id: %s", connector_cls.__name__, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. @@ -65,6 +59,23 @@ class KVConnectorFactory: # We build separately to enforce strict separation return connector_cls(config, role) + @classmethod + def get_connector_class( + cls, kv_transfer_config: "KVTransferConfig" + ) -> type[KVConnectorBaseType]: + """Get the connector class by name.""" + connector_name = kv_transfer_config.kv_connector + if connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = kv_transfer_config.kv_connector_module_path + if connector_module_path is None: + raise ValueError( + f"Unsupported connector type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + connector_cls = getattr(connector_module, connector_name) + return connector_cls + # Register various connectors here. 
# The registration should not be done in each individual file, as we want to diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 1da41790f9..2364400b3d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -13,8 +13,8 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import VllmConfig, get_current_vllm_config -from vllm.distributed.kv_transfer.kv_connector.v1.base import ( - KVConnectorBase_V1) +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) from vllm.logger import init_logger from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput @@ -106,8 +106,9 @@ def get_kv_connector_cache_layout(): vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config if kv_config is not None: - required_kvcache_layout = ( - KVConnectorBase_V1.get_required_kvcache_layout(vllm_config)) + connector_cls = KVConnectorFactory.get_connector_class(kv_config) + required_kvcache_layout = connector_cls.get_required_kvcache_layout( + vllm_config) if required_kvcache_layout is not None: return required_kvcache_layout logger.info_once("Connectors do not specify a " \ @@ -143,6 +144,8 @@ class KVOutputAggregator: finished_recving = set[str]() for output in outputs: output = output.kv_connector_output + if not output: + continue update_finished_set(output.finished_sending, self._send_remaining_count, finished_sending) update_finished_set(output.finished_recving, From 80bb1e8afe950342e93b7262e7bf25eb6d29b287 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 12 Aug 2025 13:38:48 +0100 Subject: [PATCH 199/932] Officially support SmolLM3 using the Transformers backend (#22665) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 6 ++++++ tests/models/registry.py | 1 + vllm/model_executor/models/registry.py | 3 +++ 3 files changed, 10 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ea36331542..a24fa4bcce 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -409,6 +409,12 @@ th { | `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | +Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! + +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | +|--------------|--------|-------------------|----------------------|---------------------------|---------------------| +| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ | + !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. 
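For the Transformers-backend-only models documented in the table above, nothing changes on the user side; a minimal offline-inference sketch using the example SmolLM3 checkpoint from that table (the prompt and sampling settings are illustrative assumptions):

```python
# Hedged sketch: offline generation with a model served via the Transformers
# backend. vLLM selects that backend for SmolLM3 automatically, so no extra
# flags are needed here; prompt and sampling values are placeholders only.
from vllm import LLM, SamplingParams

llm = LLM(model="HuggingFaceTB/SmolLM3-3B")
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```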
diff --git a/tests/models/registry.py b/tests/models/registry.py index eae5829030..d7d20d1f3a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -291,6 +291,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), + "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501 "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 279e045a70..64dbde4916 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -271,6 +271,9 @@ _SPECULATIVE_DECODING_MODELS = { } _TRANSFORMERS_SUPPORTED_MODELS = { + # Text generation models + "SmolLM3ForCausalLM": ("transformers", "TransformersForCausalLM"), + # Multimodal models "Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 } From f7ad6a1eb3deb9ca70a6bce3705dbd16cf9d8b28 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 12 Aug 2025 20:42:58 +0800 Subject: [PATCH 200/932] [CI Failure] fix tests/entrypoints/openai/test_skip_tokenizer.py (#22708) Signed-off-by: wang.yuqi --- .../model_executor/models/prithvi_geospatial_mae.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 304a9e987e..20f423cc76 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -25,11 +25,11 @@ import torch.nn as nn from transformers import BatchFeature from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import (AllPool, PoolerHead, - PoolerIdentity, SimplePooler) +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( - IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput) + IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput, + default_pooling_type) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -142,6 +142,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): ) +@default_pooling_type("All") @MULTIMODAL_REGISTRY.register_processor( PrithviGeoSpatialMAEMultiModalProcessor, info=PrithviGeoSpatialMAEProcessingInfo, @@ -198,7 +199,11 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, "Only SemanticSegmentationTask is supported for now " "by PrithviGeospatialMAE.") - self.pooler = SimplePooler(AllPool(), PoolerHead(PoolerIdentity())) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) def _parse_and_validate_multimodal_data( self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: From 67c153b88a2129c3b6fb78af09901738f1034a68 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Tue, 
12 Aug 2025 20:50:59 +0800 Subject: [PATCH 201/932] Fix Llama4 FlashInfer FP4 MoE issues (#22511) Signed-off-by: Po-Han Huang --- .../layers/fused_moe/flashinfer_cutlass_moe.py | 2 -- .../fused_moe/flashinfer_cutlass_prepare_finalize.py | 7 ++++++- vllm/model_executor/layers/quantization/modelopt.py | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 3e79a1a8c2..4e3e15a35a 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -170,8 +170,6 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): "w1_scale and w2_scale must not " "be None for FlashInferExperts") - assert not apply_router_weight_on_input - quant_scales = [ a1_gscale, w1_scale.view(torch.int32), diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 7fdb465c45..36aca8cf74 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -60,7 +60,12 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - assert not apply_router_weight_on_input + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) (a1_gscale, use_dp, local_tokens) = extract_required_args( extra_prepare_args, ['a1_gscale', 'use_dp', 'local_tokens']) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 147b275eaf..bed5022267 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1299,8 +1299,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): output2_scale_scalar=layer.g2_alphas.data, num_experts=global_num_experts, top_k=top_k, - n_group=num_expert_group, - topk_group=topk_group, + n_group=num_expert_group + if num_expert_group is not None else 0, + topk_group=topk_group if topk_group is not None else 0, intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, From 3d9d40efdeea7011dc3c496ad9d55cfdc90aff92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 16:30:17 +0200 Subject: [PATCH 202/932] [Bugfix][CI] Fix `test_remote_decode_lifecycle.py::test_short_prompt_lifecycle` (#22727) Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 1bddfef0f2..2f8228864e 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -130,8 +130,9 @@ def test_short_prompt_lifecycle(): # Confirm we do not have any memory leaks after req lifecycle. 
# We need to mark sending finish to clear data for persistent batch. scheduler_output = scheduler.schedule() - model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_sending = [request.request_id] + # Use create_model_runner_output to pass kv_connector_output along + model_runner_output = create_model_runner_output( + reqs=[request], finished_sending=[request.request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) assert_scheduler_empty(scheduler) From e5d3d63c42aa85025dfb1b5dec369c0c856a4efa Mon Sep 17 00:00:00 2001 From: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:41:37 +0300 Subject: [PATCH 203/932] [Benchmark] Fix terminal colors in benchmark_serving_multi_turn (python 3.12) (#22730) Signed-off-by: daniels --- benchmarks/multi_turn/bench_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py index d4d3c1ca8c..e959a4be71 100644 --- a/benchmarks/multi_turn/bench_utils.py +++ b/benchmarks/multi_turn/bench_utils.py @@ -4,7 +4,7 @@ import logging from enum import Enum -class Color(str, Enum): +class Color(Enum): RED = "\033[91m" GREEN = "\033[92m" BLUE = "\033[94m" @@ -13,6 +13,9 @@ class Color(str, Enum): YELLOW = "\033[93m" RESET = "\033[0m" + def __str__(self): + return self.value + TEXT_SEPARATOR = "-" * 100 From 5a4b4b3729e1a1594bf56d38b7c8d3f556754634 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Tue, 12 Aug 2025 21:54:52 +0530 Subject: [PATCH 204/932] Add: `SupportsEagle3` interface for explicit EAGLE3 support (#22642) Signed-off-by: Rahul Tuli --- .../speculators/test_eagle3.py | 18 ++++++- vllm/model_executor/models/interfaces.py | 53 +++++++++++++++++++ vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/qwen3.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 10 +++- 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index c46ac7a88b..45ddb21787 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -3,12 +3,20 @@ import pytest import torch +from vllm.model_executor.models.interfaces import supports_eagle3 + @pytest.mark.parametrize( "model_path", [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) -def test_llama(vllm_runner, example_prompts, model_path): +def test_llama(vllm_runner, example_prompts, model_path, monkeypatch): + # Set environment variable for V1 engine serialization + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + eagle3_supported = vllm_model.apply_model(supports_eagle3) + assert eagle3_supported + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) @@ -18,8 +26,14 @@ def test_llama(vllm_runner, example_prompts, model_path): @pytest.mark.parametrize( "model_path", [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) -def test_qwen(vllm_runner, example_prompts, model_path): +def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch): + # Set environment variable for V1 engine serialization + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + eagle3_supported = 
vllm_model.apply_model(supports_eagle3) + assert eagle3_supported + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 46caf3fce4..c425488f83 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -823,3 +823,56 @@ def supports_v0_only( model: Union[type[object], object], ) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]: return getattr(model, "supports_v0_only", False) + + +@runtime_checkable +class SupportsEagle3(Protocol): + """The interface required for models that support + EAGLE3 speculative decoding.""" + + supports_eagle3: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports EAGLE3 + speculative decoding. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + """ + Set which layers should output auxiliary + hidden states for EAGLE3. + + Args: + layers: Tuple of layer indices that should output auxiliary + hidden states. + """ + ... + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + """ + Get the layer indices that should output auxiliary hidden states + for EAGLE3. + + Returns: + Tuple of layer indices for auxiliary hidden state outputs. + """ + ... + + +@overload +def supports_eagle3(model: type[object]) -> TypeIs[type[SupportsEagle3]]: + ... + + +@overload +def supports_eagle3(model: object) -> TypeIs[SupportsEagle3]: + ... + + +def supports_eagle3( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]: + return isinstance(model, SupportsEagle3) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index bc511d8339..24cd448d83 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -49,7 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -463,7 +463,7 @@ class LlamaModel(nn.Module): return loaded_params -class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"] diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 0ad50640bb..2060206633 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP from .qwen2 import Qwen2Model from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -261,7 +261,7 @@ class Qwen3Model(Qwen2Model): 
decoder_layer_type=Qwen3DecoderLayer) -class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): +class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ed4d6bcb09..2e1cc37b1b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -35,6 +35,7 @@ from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import (is_mixture_of_experts, + supports_eagle3, supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) @@ -1981,8 +1982,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logger.info("Loading drafter model...") self.drafter.load_model(self.model) if self.use_aux_hidden_state_outputs: - self.model.set_aux_hidden_state_layers( - self.model.get_eagle3_aux_hidden_state_layers()) + if supports_eagle3(self.model): + self.model.set_aux_hidden_state_layers( + self.model.get_eagle3_aux_hidden_state_layers()) + else: + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested") time_after_load = time.perf_counter() self.model_memory_usage = m.consumed_memory logger.info("Model loading took %.4f GiB and %.6f seconds", From c42fe0b63a29d3ec157089c9784643000dde4aec Mon Sep 17 00:00:00 2001 From: TeeKen Lau <13831887+teekenl@users.noreply.github.com> Date: Wed, 13 Aug 2025 02:34:41 +1000 Subject: [PATCH 205/932] Add more test scenario for tensor schema (#22733) Signed-off-by: teekenl --- tests/utils_/test_tensor_schema.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/utils_/test_tensor_schema.py b/tests/utils_/test_tensor_schema.py index 69744921b1..6aa781c156 100644 --- a/tests/utils_/test_tensor_schema.py +++ b/tests/utils_/test_tensor_schema.py @@ -33,6 +33,31 @@ def test_tensor_schema_constant_dim_failure(): ) +def test_tensor_schema_invalid_types_in_list(): + with pytest.raises(ValueError, match="is not a torch.Tensor"): + Phi3VImagePixelInputs( + data=[ + torch.randn(64, 3, 32, 32), + "not_a_tensor", + torch.randn(64, 3, 32, 32), + ], + image_sizes=torch.randint(0, 256, (3, 2)), + ) + + +def test_tensor_schema_rank_mismatch(): + with pytest.raises(ValueError, match="has rank 3 but expected 5"): + Phi3VImagePixelInputs( + data=torch.randn(16, 64, 3), + image_sizes=torch.randint(0, 256, (16, 2)), + ) + + +def test_tensor_schema_missing_required_field(): + with pytest.raises(ValueError, match="Required field 'data' is missing"): + Phi3VImagePixelInputs(image_sizes=torch.randint(0, 256, (16, 2)), ) + + def test_tensor_schema_symbolic_dim_mismatch(): with pytest.raises(ValueError, match="expected 'bn'=12, got 16"): Phi3VImagePixelInputs( From dab4f9f764119117c8ea1af0a3b5bcbb1c80bf76 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:50:31 -0400 Subject: [PATCH 206/932] [Chore] Update CODEOWNERS to include @yewentao256 for CUDA kernels, attention backends, quantization, and related tests (#22741) Signed-off-by: yewentao256 --- .github/CODEOWNERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/.github/CODEOWNERS b/.github/CODEOWNERS index 0a7f8e8be4..a0a327319a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,7 +9,7 @@ /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor +/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @@ -34,16 +34,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu -/tests/quantization @mgoin @robertgshaw2-redhat +/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/weight_loading @mgoin @youkaichao +/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee # Docs From 6bd8ebf026600e9851026c8850f88c5e10acfab1 Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng Date: Tue, 12 Aug 2025 12:53:36 -0700 Subject: [PATCH 207/932] [Kernel][AMD] Avoid D2H copy and cumsum kernel (#22683) Signed-off-by: Xiaozhu Signed-off-by: Michael Goin Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/attention/backends/rocm_aiter_fa.py | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index abe0517450..e8bffbef44 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -214,12 +214,14 @@ class AiterFlashAttentionMetadata: # |-- query_len ---| num_actual_tokens: int # Number of tokens excluding padding. + num_actual_kv_tokens: int max_query_len: int query_start_loc: torch.Tensor max_seq_len: int seq_lens: torch.Tensor slot_mapping: torch.Tensor block_table: torch.Tensor + cu_seq_lens: Optional[torch.Tensor] # For cascade attention. 
use_cascade: bool @@ -272,6 +274,20 @@ class AiterFlashAttentionMetadataBuilder( seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping + if max_query_len > 1: + # We pre-compute cumulative seq len needed for prefill attention + # here to avoid recomputing it for every layer + cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, + dtype=torch.int32, + device=seq_lens.device) + torch.cumsum(seq_lens, + dim=0, + dtype=cu_seq_lens.dtype, + out=cu_seq_lens[1:]) + num_actual_kv_tokens = int(cu_seq_lens[-1].item()) + else: + cu_seq_lens = None + num_actual_kv_tokens = 0 def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): @@ -281,12 +297,14 @@ class AiterFlashAttentionMetadataBuilder( attn_metadata = AiterFlashAttentionMetadata( num_actual_tokens=num_actual_tokens, + num_actual_kv_tokens=num_actual_kv_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_lens=seq_lens, block_table=block_table_tensor, slot_mapping=slot_mapping, + cu_seq_lens=cu_seq_lens, use_cascade=use_cascade, common_prefix_len=common_prefix_len, total_tokens=self.total_tokens, @@ -475,16 +493,6 @@ class AiterFlashAttentionImpl(AttentionImpl): block_table = attn_metadata.block_table if max_seqlen_q > 1: - - cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1, - dtype=torch.int32, - device=query.device) - - torch.cumsum(seqused_k, - dim=0, - dtype=cu_seq_lens.dtype, - out=cu_seq_lens[1:]) - torch.ops.vllm.flash_attn_varlen_func( query[:num_actual_tokens], key_cache, @@ -497,10 +505,10 @@ class AiterFlashAttentionImpl(AttentionImpl): alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, - cu_seqlens_k=cu_seq_lens, + cu_seqlens_k=attn_metadata.cu_seq_lens, k_scale=layer._k_scale, v_scale=layer._v_scale, - total_tokens=attn_metadata.total_tokens, + total_tokens=attn_metadata.num_actual_kv_tokens, ) _, num_heads, head_size = query.shape From 422f22e01265b0ba6a99763e0b69f8dbba06b371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 21:53:52 +0200 Subject: [PATCH 208/932] [CI][Nixl] Check kv cache layout during handshake (#22745) Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 46 +++++++++++++++++++ .../kv_connector/v1/nixl_connector.py | 13 ++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c673983235..3860d7c857 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -419,6 +419,52 @@ class TestNixlHandshake: return raise TimeoutError("Took too long to complete async handshake.") + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper) + def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init): + """ + Verify that adding a remote agent fails if kv_cache_layout differs. + This test is only relevant for heterogeneous TP. 
+ """ + vllm_config = create_vllm_config() + + # Mock TP world size to 2 to force heterogeneous TP when + # remote_tp_size=1 + with patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size", # noqa: E501 + return_value=2): + # Initialize connector and worker (with fake NIXL wrapper) + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, connector.engine_id, hand_shake_latency=0) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_bytes = 4096 + worker.block_len = worker.slot_size_bytes * worker.block_size + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + + # Metadata with different kv_cache_layout than local worker + mismatched_layout = "HND" if worker.kv_cache_layout != "HND" \ + else "NHD" + meta = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + num_blocks=1, + block_len=worker.block_len, + attn_backend_name=worker.backend_name, + kv_cache_layout=mismatched_layout, + ) + + # We don't check layout for homogeneous TP and MLA for now, as the + # whole block is moved. + worker.add_remote_agent(meta, remote_tp_size=2) + with pytest.raises(AssertionError): + worker.add_remote_agent(meta, remote_tp_size=1) + # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a6eeb27853..4f51229ffb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -30,6 +30,7 @@ from vllm.forward_context import ForwardContext from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -73,6 +74,7 @@ class NixlAgentMetadata( num_blocks: int block_len: int attn_backend_name: str + kv_cache_layout: str @dataclass @@ -538,7 +540,9 @@ class NixlConnectorWorker: attn_backend = backend_name_to_enum(self.backend_name) self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1 self._use_pallas_v1 = attn_backend == _Backend.PALLAS_VLLM_V1 + self.kv_cache_layout = get_kv_cache_layout() logger.debug("Detected attention backend %s", self.backend_name) + logger.debug("Detected kv cache layout %s", self.kv_cache_layout) self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} # With heterogeneous TP, P must wait for all assigned D TP workers to @@ -839,7 +843,8 @@ class NixlConnectorWorker: kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], num_blocks=self.num_blocks, block_len=self.block_len, - attn_backend_name=self.backend_name) + attn_backend_name=self.backend_name, + kv_cache_layout=self.kv_cache_layout) ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, @@ -900,8 +905,7 @@ class NixlConnectorWorker: self._tp_size[engine_id] = remote_tp_size else: assert self._tp_size[engine_id] == remote_tp_size - # We may 
eventually enable this after asserting equality in cache - # layout and close outputs. + # TODO We may eventually want to skip enforcing the same attn backend. assert nixl_agent_meta.attn_backend_name == self.backend_name remote_agent_name = self.nixl_wrapper.add_remote_agent( @@ -930,6 +934,9 @@ class NixlConnectorWorker: if self._use_flashinfer: # Account for joint KV in FlashInfer. remote_block_size //= 2 + if tp_ratio > 1: + # Heterogeneous TP expects same kv_cache_layout. + assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( "Remote P worker KV layer cache must be of shape [2, N, " From 6534d2fc9773db101e0cb6d2bd9617bfd41e7876 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 12 Aug 2025 12:54:42 -0700 Subject: [PATCH 209/932] Fix torch version check for SM100 mxfp4 (#22535) Signed-off-by: Zifei Tong Signed-off-by: mgoin Co-authored-by: mgoin --- vllm/model_executor/layers/fused_moe/layer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d5a89655e3..fb38fb91ea 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -741,12 +741,14 @@ class FusedMoE(torch.nn.Module): # we padding globally so EP buffer allocation works if quant_config and quant_config.get_name() == "mxfp4": - if not is_torch_equal_or_newer("2.8.0"): - raise RuntimeError("Mxfp4 on hopper requires torch >= 2.8.0") - if current_platform.is_device_capability( - 90) and not has_triton_kernels(): - raise NotImplementedError( - "Triton kernels must be installed for mxfp4 on hopper") + if not current_platform.is_device_capability(100): + if not is_torch_equal_or_newer("2.8.0"): + raise RuntimeError( + "Mxfp4 on non-blackwell requires torch >= 2.8.0") + if not has_triton_kernels(): + raise NotImplementedError( + "triton_kernels must be installed for " + "mxfp4 on non-blackwell") if (current_platform.is_rocm() or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): From 53c730286c5ad86a6d78d4a4d8a2cd7042725d24 Mon Sep 17 00:00:00 2001 From: RUTHLESS-BOT Date: Wed, 13 Aug 2025 04:31:48 +0800 Subject: [PATCH 210/932] [Misc] parametrize 'dtype' in test_flash_mla (#22641) Signed-off-by: RUTHLESS-BOT Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_flashmla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 21b08e45fd..81841be583 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -35,11 +35,10 @@ FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \ @pytest.mark.parametrize("block_size", [64]) @pytest.mark.parametrize("causal", [True]) @pytest.mark.parametrize("varlen", [False, True]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @torch.inference_mode() def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, - varlen): - # TODO: parametrize using pytest - dtype = torch.bfloat16 + varlen, dtype): device = torch.device("cuda:0") torch.set_default_dtype(dtype) torch.set_default_device(device) @@ -48,7 +47,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, random.seed(0) print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, 
" - f"{d=}, {dv=}, {causal=}, {varlen=}") + f"{d=}, {dv=}, {causal=}, {varlen=}, {dtype=}") cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32) if varlen: From ba81acbdc1eec643ba815a76628ae3e4b2263b76 Mon Sep 17 00:00:00 2001 From: Frank Wang <41319051+frankwang28@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:43:06 -0700 Subject: [PATCH 211/932] [Bugfix] Bump DeepGEMM Version to Fix SMXX Layout Issues (#22606) Signed-off-by: frankwang28 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b96d50f0a1..a20a4bfb2b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -432,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install DeepGEMM from source ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" -ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment CUDA_MAJOR="${CUDA_VERSION%%.*}" From 45c3936e945ee1b869911f155d5519f2b60ce9d1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:26 +0100 Subject: [PATCH 212/932] [Docs] Hide the navigation and toc sidebars on home page (#22749) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/README.md b/docs/README.md index e8d2fd953a..683e1d3756 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,3 +1,9 @@ +--- +hide: + - navigation + - toc +--- + # Welcome to vLLM
From d0a63015888f5d5ab33e369bfa5ede4c8e0faea7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:30 +0100 Subject: [PATCH 213/932] Fix Transformers backend tensor parallel for multimodal models (#22673) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 49 +++++++++++++++------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 25b8b69e08..4ec2b683fc 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -505,30 +505,47 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): Apply the model's tensor parallelization plan. Currently only supports linear layers. """ - tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} + # Look for tp plans in all of the PreTrainedModels found in self.model + is_pretrained_model = lambda m: isinstance(m, PreTrainedModel) + supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None + pretrained_models = filter(is_pretrained_model, self.model.modules()) + models_with_tp_plan = filter(supports_tp_plan, pretrained_models) - if not tp_plan and self.tp_size > 1: + if not any(models_with_tp_plan) and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - # Some weight loaders expect linear layers to inherit from vLLM's - # LinearBase class, so we set a default style which causes any - # unspecified linear layers to be replaced with ReplicatedLinear - tp_plan[".*"] = "replicate" + def _tensor_parallel(module: nn.Module, + prefix: str = "", + tp_plan=None): + tp_plan = tp_plan or {} - def _tensor_parallel(module: nn.Module, prefix: str = ""): + # If the current module is a PreTrainedModel, set the tp_plan for + # all of its children + if isinstance(module, PreTrainedModel): + tp_plan = module.config.base_model_tp_plan or {} + tp_plan = { + maybe_prefix(prefix, k): v + for k, v in tp_plan.items() + } + + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in tp_plan.items(): - if re.match(pattern, qual_name) and isinstance( - child_module, nn.Linear): - new_module = replace_linear_class( - child_module, style, self.quant_config) - setattr(module, child_name, new_module) - log_replacement(qual_name, child_module, new_module) - break + if isinstance(child_module, nn.Linear): + generator = (p for p in tp_plan if re.match(p, qual_name)) + pattern = next(generator, None) + style = tp_plan.get(pattern, "replicate") + new_module = replace_linear_class(child_module, style, + self.quant_config) + setattr(module, child_name, new_module) + log_replacement(qual_name, child_module, new_module) else: - _tensor_parallel(child_module, prefix=qual_name) + _tensor_parallel(child_module, + prefix=qual_name, + tp_plan=tp_plan) _tensor_parallel(self.model) From fde0b611a37e442cb8a53999a1cce48d76f49c16 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 13 Aug 2025 08:13:17 +0800 Subject: [PATCH 214/932] [Model] Decouple glm4v (#22751) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/glm4_1v.py 
| 26 +++++++++++++++++++++----- vllm/model_executor/models/registry.py | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce..dbbbc5122b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7983895687..2a89c03bfe 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1227,10 +1227,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, "k_proj", "v_proj", ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], + "gate_up_proj": ["gate_up_proj"] } # To ensure correct weight loading and mapping. 
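For context on the hunk above: packed_modules_mapping tells the weight loader which checkpoint tensors get stacked into a single fused parameter. An identity entry such as "gate_up_proj": ["gate_up_proj"] indicates the dense GLM-4.1V checkpoint already ships the fused tensor, while the original split mapping is retained for the new GLM-4.5V MoE subclass added further down in this patch. A minimal, standalone sketch of the stacking idea (illustrative names only, not vLLM's actual loader):

    import torch

    packed_modules_mapping = {
        # Split mapping: the checkpoint stores gate_proj and up_proj separately.
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def fuse_checkpoint_weights(state_dict, mapping):
        fused = {}
        for packed_name, shard_names in mapping.items():
            if shard_names == [packed_name]:
                # Identity mapping: the tensor is already fused in the checkpoint.
                fused[packed_name] = state_dict[packed_name]
            else:
                # Stack the listed shards along the output dimension.
                fused[packed_name] = torch.cat(
                    [state_dict[name] for name in shard_names], dim=0)
        return fused

    # Example: two 8x4 shards become one 16x4 fused gate_up_proj weight.
    ckpt = {"gate_proj": torch.randn(8, 4), "up_proj": torch.randn(8, 4)}
    print(fuse_checkpoint_weights(ckpt, packed_modules_mapping)["gate_up_proj"].shape)
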
@@ -1567,7 +1564,26 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, Get the module prefix in multimodal models """ return MultiModelKeys.from_string_field( - language_model="language_model", + language_model="language_model.model", connector="visual.merger.", tower_model="visual.", ) + + +@MULTIMODAL_REGISTRY.register_processor( + Glm4vMultiModalProcessor, + info=Glm4vProcessingInfo, + dummy_inputs=Glm4vDummyInputsBuilder, +) +class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 64dbde4916..b817615b43 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -208,7 +208,7 @@ _MULTIMODAL_MODELS = { "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 - "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), From e18859298d109870b22cb5b8672d1078818e268d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Aug 2025 20:14:46 -0400 Subject: [PATCH 215/932] Add hardware plugins to installation doc (#22732) Signed-off-by: Michael Goin Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index a252343dce..f6ecceb85d 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -14,3 +14,16 @@ vLLM supports the following hardware platforms: - [Google TPU](google_tpu.md) - [Intel Gaudi](intel_gaudi.md) - [AWS Neuron](aws_neuron.md) + +## Hardware Plugins + +The backends below live **outside** the main `vllm` repository and follow the +[Hardware-Pluggable RFC](../design/plugin_system.md). 
+ +| Accelerator | PyPI / package | Repository | +|-------------|----------------|------------| +| Ascend NPU | `vllm-ascend` | | +| Intel Gaudi (HPU) | N/A, install from source | | +| MetaX MACA GPU | N/A, install from source | | +| Rebellions ATOM / REBEL NPU | `vllm-rbln` | | +| IBM Spyre AIU | `vllm-spyre` | | From 71683ca6f6764f35abe23d612a0d7dbd33babe32 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:18:39 -0700 Subject: [PATCH 216/932] [V0 Deprecation] Remove multi-step scheduling (#22138) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- .../tests/genai-perf-tests.json | 1 - .../tests/nightly-tests.json | 6 - .buildkite/test-pipeline.yaml | 22 - .github/CODEOWNERS | 1 - tests/async_engine/test_async_llm_engine.py | 409 -------- tests/config/test_config.yaml | 1 - tests/config/test_config_with_model.yaml | 1 - tests/core/test_chunked_prefill_scheduler.py | 10 +- tests/core/test_num_computed_tokens_update.py | 24 +- .../test_multi_step_output_processor.py | 274 ------ .../openai/correctness/test_lmeval.py | 3 - tests/metrics/test_metrics.py | 39 - .../models/language/generation/test_hybrid.py | 26 - .../multi_step/test_correctness_async_llm.py | 232 ----- tests/multi_step/test_correctness_llm.py | 383 -------- tests/samplers/test_logits_processor.py | 70 -- tests/tpu/lora/test_lora.py | 1 - tests/utils_/test_utils.py | 2 - tests/v1/test_oracle.py | 6 - tests/worker/test_model_input.py | 79 -- vllm/config/__init__.py | 2 - vllm/core/scheduler.py | 92 +- vllm/engine/arg_utils.py | 43 +- vllm/engine/async_llm_engine.py | 26 +- vllm/engine/llm_engine.py | 178 +--- vllm/engine/output_processor/interfaces.py | 26 +- vllm/engine/output_processor/multi_step.py | 211 ---- vllm/platforms/cuda.py | 14 +- vllm/platforms/rocm.py | 14 +- vllm/platforms/tpu.py | 7 +- vllm/sequence.py | 38 - vllm/worker/model_runner.py | 7 +- vllm/worker/multi_step_model_runner.py | 908 ------------------ vllm/worker/multi_step_neuron_model_runner.py | 84 -- ...i_step_neuronx_distributed_model_runner.py | 63 -- vllm/worker/multi_step_worker.py | 197 ---- vllm/worker/neuron_worker.py | 22 +- 37 files changed, 57 insertions(+), 3465 deletions(-) delete mode 100644 tests/async_engine/test_async_llm_engine.py delete mode 100644 tests/engine/test_multi_step_output_processor.py delete mode 100644 tests/multi_step/test_correctness_async_llm.py delete mode 100644 tests/multi_step/test_correctness_llm.py delete mode 100644 tests/samplers/test_logits_processor.py delete mode 100644 vllm/engine/output_processor/multi_step.py delete mode 100644 vllm/worker/multi_step_model_runner.py delete mode 100644 vllm/worker/multi_step_neuron_model_runner.py delete mode 100644 vllm/worker/multi_step_neuronx_distributed_model_runner.py delete mode 100644 vllm/worker/multi_step_worker.py diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index f26ae7634f..afb844880f 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -12,7 +12,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 41b4a40088..423a3bfe12 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ 
b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -36,7 +36,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -90,7 +89,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -144,7 +142,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -195,7 +192,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -248,7 +244,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -301,7 +296,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebcf51981e..740be2bc87 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -67,7 +67,6 @@ steps: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal @@ -773,27 +772,6 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. 
- # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py - - label: Pipeline Parallelism Test # 45min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a0a327319a..b0dd5e99d4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -36,7 +36,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py deleted file mode 100644 index 0eb7a6eb52..0000000000 --- a/tests/async_engine/test_async_llm_engine.py +++ /dev/null @@ -1,409 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import uuid -from asyncio import CancelledError -from copy import copy -from dataclasses import dataclass, field -from typing import Any, Optional - -import pytest -import pytest_asyncio -import torch - -from vllm import SamplingParams -from vllm.config import ParallelConfig -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine -from vllm.outputs import RequestOutput as RealRequestOutput -from vllm.sampling_params import RequestOutputKind - -from ..utils import wait_for_gpu_memory_to_clear - - -@dataclass -class RequestOutput: - request_id: int - finished: bool = False - - -@dataclass -class MockModelConfig: - use_async_output_proc = True - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - - -class MockEngine: - - def __init__(self): - self.step_calls = 0 - self.add_request_calls = 0 - self.abort_request_calls = 0 - self.request_id = None - # Ugly, remove dependency when possible - self.parallel_config = ParallelConfig() - self.model_config = MockModelConfig() - - async def step_async(self, virtual_engine): - # PP size is 1, ignore virtual engine - self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] - - async def process_model_inputs_async(self, *args, **kwargs): - pass - - async def stop_remote_worker_execution_loop_async(self): - pass - - def generate(self, request_id): - self.request_id = request_id - - def stop_generating(self): - self.request_id = None - - def add_request(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - print(f'Request calls: {self.add_request_calls}') - - async def add_request_async(self, **kwargs): - self.add_request_calls += 1 - return - - def abort_request(self, request_id): - del request_id # Unused - self.abort_request_calls += 1 - - def has_unfinished_requests(self): - return self.request_id is not None - - def has_unfinished_requests_for_virtual_engine(self, virtual_engine): - return self.request_id is not None - - -class MockAsyncLLMEngine(AsyncLLMEngine): - _engine_class = MockEngine - - -@pytest.mark.asyncio -async def test_new_requests_event(): - params = SamplingParams() - - engine = MockAsyncLLMEngine() - engine.start_background_loop() - await asyncio.sleep(0.01) - assert engine.engine.step_calls == 0 - - await engine.add_request("1", "", 
params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 1 - assert engine.engine.step_calls == 1 - - await engine.add_request("2", "", params) - engine.engine.generate("2") - await asyncio.sleep(0) - await asyncio.sleep(0) - await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls >= 2 - await asyncio.sleep(0.001) - assert engine.engine.step_calls >= 3 - engine.engine.stop_generating() - await asyncio.sleep(0.001) - old_step_calls = engine.engine.step_calls - await asyncio.sleep(0.001) - assert engine.engine.step_calls == old_step_calls - - await engine.add_request("3", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - - engine = MockAsyncLLMEngine() - assert engine.get_model_config() is not None - assert engine.get_tokenizer() is not None - assert engine.get_decoding_config() is not None - - -def start_engine(): - wait_for_gpu_memory_to_clear( - devices=list(range(torch.cuda.device_count())), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1")) - print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") - - return AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(model="facebook/opt-125m", - enforce_eager=True, - num_scheduler_steps=num_scheduler_steps)) - - -def uid() -> str: - return str(uuid.uuid4()) - - -@pytest_asyncio.fixture(scope="module") -async def async_engine(): - # We cannot use monkeypatch since this is a module - # scoped fixture and monkeypatch is function scoped. 
- previous_value = os.getenv("VLLM_USE_V1", None) - os.environ["VLLM_USE_V1"] = "0" - engine = await asyncio.get_event_loop().run_in_executor(executor=None, - func=start_engine) - try: - yield engine - finally: - engine.shutdown_background_loop() - del engine - await asyncio.sleep(0.1) - cleanup_dist_env_and_memory() - - if previous_value: - os.environ["VLLM_USE_V1"] = previous_value - else: - del os.environ["VLLM_USE_V1"] - - -@pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: - # So we can share the async engine fixture between these tests - return False - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_asyncio_run(async_engine, stop): - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - async def run(prompt: str): - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - sampling_params, - request_id=uid()): - output_count += 1 - final_output = output - return final_output, output_count - - results = await asyncio.gather( - run("test0"), - run("test0"), - ) - assert len(results) == 2 - first, second = results - - # remove nondeterministic fields for comparison - first[0].metrics = None - second[0].metrics = None - first[0].request_id = None - second[0].request_id = None - - assert str(first) == str(second) - - output_count = results[0][1] - if num_scheduler_steps == 1: - assert output_count == 32 - else: - assert 1 < output_count < 32 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_output_kinds(async_engine, stop): - """Test that output_kind works as expected and that - results are equivalent across different kinds.""" - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - async def run(prompt: str, kind: RequestOutputKind): - params = copy(sampling_params) - params.output_kind = kind - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - output_count += 1 - final_output = output - - assert final_output is not None - assert final_output.finished - - return (final_output.prompt_token_ids, - final_output.outputs[0].token_ids, - final_output.outputs[0].text, output_count) - - async def run_deltas(prompt: str): - params = copy(sampling_params) - params.output_kind = RequestOutputKind.DELTA - - prompt_tokens = None - output_tokens: list[int] = [] - output_text = "" - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - token_ids = output.outputs[0].token_ids - text = output.outputs[0].text - final_output = output - - # Ensure we get prompt ids iff we haven't yet received output tokens - if output_tokens: - assert 1 <= len(token_ids) <= num_scheduler_steps - assert stop or text - assert not output.prompt_token_ids - else: - assert output.prompt_token_ids - prompt_tokens = output.prompt_token_ids - - output_tokens.extend(token_ids) - output_text += text - - output_count += 1 - - assert final_output is not None - assert final_output.finished - - return prompt_tokens, output_tokens, output_text, 
output_count - - results = await asyncio.gather( - run("common input prompt", RequestOutputKind.CUMULATIVE), - run("common input prompt", RequestOutputKind.FINAL_ONLY), - run_deltas("common input prompt")) - - # Make sure outputs are the same - prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results) - assert len(prompt_set) == 1 - - text_set = set(text for _, _, text, _ in results) - assert len(text_set) == 1 - - tokens_set = set(tuple(ids) for _, ids, _, _ in results) - assert len(tokens_set) == 1 - - cumulative, final, deltas = results - - # output message counts - assert cumulative[3] == deltas[3] - - if num_scheduler_steps == 1: - assert cumulative[3] == 32 - else: - assert 1 < cumulative[3] < 32 - - assert final[3] == 1 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_cancellation(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - min_tokens=13, - max_tokens=13, - stop=stop, - ) - - stop_at = 5 if num_scheduler_steps == 1 else 1 - - request_id = uid() - - i = 0 - with pytest.raises(CancelledError): - async for output in async_engine.generate("test2", - sampling_params, - request_id=request_id): - assert not output.finished - i += 1 - if i == stop_at: - await async_engine.abort(request_id) - - assert i == stop_at - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_delayed_generator(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - stop=stop, - ) - - stream = async_engine.generate("test3", sampling_params, request_id=uid()) - i = 0 - final_output: Optional[RealRequestOutput] = None - async for output in stream: - final_output = output - if i == 0: - # wait for generation to complete before consuming - # the remaining messages - await asyncio.sleep(1) - if i < 9: - assert not output.finished - i += 1 - - assert i == 10 - assert final_output is not None - assert len(final_output.outputs[0].token_ids) == 10 - assert final_output.finished - - -@pytest.mark.asyncio(scope="module") -async def test_invalid_argument(async_engine): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - ) - - # Targeting specific DP rank only supported in v1 multi-instance DP - with pytest.raises(ValueError): - async for _ in async_engine.generate("test", - sampling_params, - request_id=uid(), - data_parallel_rank=0): - pass diff --git a/tests/config/test_config.yaml b/tests/config/test_config.yaml index 5090e8f357..a16857b5f2 100644 --- a/tests/config/test_config.yaml +++ b/tests/config/test_config.yaml @@ -2,4 +2,3 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml index d8c8c7bc81..9fbdb77d4e 100644 --- a/tests/config/test_config_with_model.yaml +++ b/tests/config/test_config_with_model.yaml @@ -4,4 +4,3 @@ port: 
12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index d4dacc4f12..ce1fe189b3 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -644,11 +644,9 @@ def test_chunked_prefill_preempt(): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) -def test_chunked_prefill_spec_prefill(num_scheduler_steps): +def test_chunked_prefill_spec_prefill(): """Verify that the num_lookahead_slots is set appropriately for an all""" - """prefill batch depending on whether multi-step scheduling is enabled""" - """or not""" + """prefill batch.""" block_size = 4 max_seqs = 30 max_model_len = 200 @@ -661,7 +659,6 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): max_model_len, enable_chunked_prefill=True, num_lookahead_slots=num_lookahead_slots, - num_scheduler_steps=num_scheduler_steps, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 @@ -679,8 +676,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): assert out.num_prefill_groups == 1 assert out.num_batched_tokens == max_num_batched_tokens print(out.num_lookahead_slots) - assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else - num_lookahead_slots) + assert out.num_lookahead_slots == 0 def test_chunked_prefill_max_seqs(): diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 9e1b7913df..131a7b3a62 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -6,7 +6,6 @@ import pytest from tests.conftest import VllmRunner from tests.core.utils import create_dummy_prompt from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform from vllm.sequence import SequenceGroup MODEL = "JackFram/llama-160m" @@ -17,32 +16,19 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup): scheduler.add_seq_group(seq_group) -@pytest.mark.parametrize("num_scheduler_steps", [1, 8]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True]) -def test_num_computed_tokens_update(num_scheduler_steps: int, - enable_chunked_prefill: bool, +def test_num_computed_tokens_update(enable_chunked_prefill: bool, enforce_eager: bool): - is_multi_step = num_scheduler_steps > 1 - is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill - - if is_multi_step_chunked_prefill and current_platform.is_rocm(): - pytest.skip("Multi-step with Chunked-Prefill does not support " - "rocm_flash_attn backend") - # Make a vllm engine runner = VllmRunner(model_name=MODEL, gpu_memory_utilization=0.7, - num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) engine: LLMEngine = runner.llm.llm_engine - # In multi-step + chunked-prefill there is no separate single prompt step. - # What is scheduled will run for num_scheduler_steps always. 
- num_prompt_steps = num_scheduler_steps \ - if is_multi_step_chunked_prefill else 1 + num_prompt_steps = 1 num_output_tokens_list = [4, 8, 12, 15, 16, 17] @@ -73,10 +59,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, # Test correctness of num_computed_tokens after the decode steps assert seq.data.get_num_computed_tokens( ) == prompt_num_computed_tokens + decode_step_counter - for _ in range(num_scheduler_steps): - # decode step - engine.step() - decode_step_counter += 1 + engine.step() + decode_step_counter += 1 # Test correctness of num_computed_tokens after the sequence finish. assert seq.data.get_num_computed_tokens( diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py deleted file mode 100644 index 458f4deb74..0000000000 --- a/tests/engine/test_multi_step_output_processor.py +++ /dev/null @@ -1,274 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -from transformers import PreTrainedTokenizer - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput, SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -from ..core.utils import create_seq_group - - -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [1, 12]) -@pytest.mark.skip_global_cleanup -def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify multi-step decoding appends token ids correctly. - - We append token ids and verify all the token ids were appended correctly. - Note that ignore_eos=True. 
- """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=1024, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, - ignore_eos=True), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids - output_processor.process_outputs(seq_group, outputs) - assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) -@pytest.mark.parametrize("max_tokens", [128 + 3]) -@pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, max_tokens: int): - """Verify tokens after max_tokens are dropped and not appended to the - sequence. - """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=max_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go over max tokens in len. - assert seq.get_len() == seq_prompt_len + max_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """Verify the eos token id is included in the sequence, but subsequent - tokens are dropped (not appended to sequence). 
- """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go beyond provided eos. - assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:eos_index + 1] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """When sampling parameters dictate that we should ignore the eos token id, - ensure all token ids are appended even if the eos token id is emitted. - """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, - ignore_eos=True, - ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to go beyond eos. 
- assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -def mock_tokenizer(eos_token_id=1000): - tokenizer = MagicMock(spec=PreTrainedTokenizer) - tokenizer.eos_token_id = eos_token_id - return tokenizer diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index d75731637d..684407cd6e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -26,15 +26,12 @@ DEFAULT_ARGS = ["--max-model-len", "4096"] MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked - ["--num-scheduler-steps", "8"], # MS - ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] MAX_WAIT_SECONDS = None if current_platform.is_tpu(): MORE_ARGS_LIST = [ [], # Default - # ["--num-scheduler-steps", "8"], # Multi-step << currently fails ] MAX_WAIT_SECONDS = 600 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8cae8a80d3..dbd9c518e0 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -94,45 +94,6 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [128, 129]) -@pytest.mark.parametrize("disable_async_output_proc", [True, False]) -def test_metric_counter_generation_tokens_multi_step( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - disable_async_output_proc: bool, -) -> None: - num_scheduler_steps = 8 - with vllm_runner( - model, - disable_log_stats=False, - gpu_memory_utilization=0.4, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - # The multi-step scheduling will continue to execute forward even when - # encountering EOS, leading to slightly imprecise metrics. 
- assert abs(vllm_generation_count - metric_count) <\ - len(example_prompts) * num_scheduler_steps, \ - (f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 76f6c226ba..19fcbf5616 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -331,32 +331,6 @@ def test_state_cleanup( "could be related to finished_requests_ids") -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_multistep_correctness( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, -) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) - - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: - vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_multistep, - outputs_1_lst=vllm_outputs_single_step, - name_0="vllm_outputs_multistep", - name_1="vllm_outputs_single_step", - ) - - @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py deleted file mode 100644 index 56e339d485..0000000000 --- a/tests/multi_step/test_correctness_async_llm.py +++ /dev/null @@ -1,232 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the AsyncLLMEngine with multi-step-decoding -from typing import Optional - -import pytest - -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close -from ..utils import (completions_with_server_args, get_client_text_generations, - get_client_text_logprob_generations) - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - -DEFAULT_SERVER_ARGS: list[str] = [ - "--distributed-executor-backend", - "ray", - "--gpu-memory-utilization", - "0.85", - "--swap-space", - "16", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 1), - (2, 2), -]) -@pytest.mark.parametrize("eager_mode", [False, True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("is_async", [True]) -@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.asyncio -async def test_multi_step( - example_prompts, - model: str, - tp_size: int, - pp_size: int, - eager_mode: int, - num_scheduler_steps: int, - num_prompts: int, - is_async: bool, - num_logprobs: Optional[int], - attention_backend: str, - enable_chunked_prefill: bool, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling in an OpenAI-protocol - client/server environment. - - Set up an engine with single-step scheduling as a ground-truth reference. 
- - Send a completions API request to both engines with the same prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - """ - if enable_chunked_prefill and \ - (pp_size > 1 or attention_backend != "FLASH_ATTN"): - pytest.skip("Multi-step with Chunked-Prefill only supports" - "PP=1 and FLASH_ATTN backend") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] - - if not is_async: - ms_server_args += ["--disable-async-output-proc"] - - if eager_mode: - ms_server_args.append("--enforce-eager") - - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") - - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations - - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations( - ref_completions) - test_text_logprobs = get_client_text_logprob_generations( - test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 2), -]) -@pytest.mark.asyncio -async def test_multi_step_pp_smoke( - tp_size: int, - pp_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Smoke test for the vLLM engine with multi-step scheduling in an - OpenAI-protocol client/server environment. - - This tests compares the outputs between multi-step scheduling and - single-step scheduling. Notably, this test lets the engines generate - more tokens (default is 5) and test for an exact match over all the - tokens. 
- - Args: - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - """ - - model = "JackFram/llama-160m" - num_scheduler_steps = 8 - attention_backend = "FLASH_ATTN" - max_num_seqs = 3 - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. - max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) - - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] - - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. 
- ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - - assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py deleted file mode 100644 index 0df00c98b7..0000000000 --- a/tests/multi_step/test_correctness_llm.py +++ /dev/null @@ -1,383 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the LLMEngine with multi-step-decoding - -import copy -from typing import Optional - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_multi_step_llm( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - enable_chunked_prefill: bool, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling via sync LLM Engine. - - Set up a HuggingFace (HF) transformers model as a ground-truth reference. - - Prompt them with the same example prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - enable_chunked_prefill: chunked-prefill on/off - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. 
- """ - if current_platform.is_rocm() and \ - (attention_backend == "FLASHINFER" or enable_chunked_prefill): - pytest.skip( - "Multi-Step with FLASHINFER or Chunked-Prefill is not supported" - "on ROCm") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -def test_multi_step_llm_w_prompt_logprobs( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test prompt logprobs with multi-step scheduling via sync LLM Engine. - - Set up a vLLM engine instance w/ single-step scheduling as a ground-truth - reference. - - Prompt them with the same example prompts. - - Validate: - * All generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - num_prompt_logprobs: number of logprobs to return for each prompt token; - note that this argument is not supported by the - OpenAI completions endpoint. 
- """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -@pytest.mark.skipif( - current_platform.is_rocm(), - reason="Multi-Step + Chunked-Prefill not supported on ROCm") -def test_multi_step_llm_chunked_prefill_prefix_cache( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. - - Set up contrived scenario which tests for a possible failure mode of - scheduling with multi-step+"single-step chunked prefill"+APC - - "single-step chunked prefill" here refers to the current vLLM multi-step+ - chunked-prefill implementation, which requires that a prefill may only - be scheduled in the same step as decodes if the prefill prompt fits in a - single chunk (note that "complete" multi-step+chunked-prefill would allow - a prefill to span multiple chunks & multiple steps but that is not yet - the case.) - - "APC" is short for "automatic prefix caching". - - This test creates a scenario where the scheduler must decide whether/how - to schedule a prefill with a prompt that exceeds the available token budget. - The correct behavior for multi-step+"single-step chunked prefill"+APC is to - put off scheduling the prefill until a future step. - - Validate that: - * Multi-step kernels do not raise an exception due to incorrect scheduler - behavior - * Generated tokens match between - multi-step+"single-step chunked prefill"+APC and - single-step scheduling. 
- * (If logprobs are enabled) check logprobs are close enough - - Args: - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. - """ - - # Set up contrived test for correct scheduling behavior with - # multi-step+"single-step chunked prefill"+APC. - # - # Assume block_size=16 - # - # Assume max_num_batched_tokens=48 - # => Per-step token budget=48 - # - # 1. Scheduler schedules 0th prompt (24 tokens) - # => Remaining token budget=24 - # 2. Scheduler attempts to schedule 1st prompt (30 tokens) - # * 30 tokens exceeds 24 token remaining budget - # * Correct behavior: do not schedule this prompt in this step - # * Incorrect behavior: schedule prompt chunk - # * `do_sample=False` for this prompt in this step - # * Chunk size = (remaining tokens // block size) * block size - # - # The Incorrect scheduling behavior - if it occurs - will cause an exception - # in the model runner resulting from `do_sample=False`. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ( - 'vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n') # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok - - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts - - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - 
name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py deleted file mode 100644 index 123f9595e9..0000000000 --- a/tests/samplers/test_logits_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_logits_processor_force_generate( - vllm_runner, - example_prompts, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times - - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits - - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[1], - params=SamplingParams( - prompt_logprobs=3, - max_tokens=max_tokens, - ), - ) - - # test grouped requests - vllm_model.llm._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) - - outputs = vllm_model.llm._run_engine(use_tqdm=False) - - assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 4c47b8c43c..636108e985 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -30,7 +30,6 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", - num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index a2db1ae684..8be1e103dc 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -236,7 +236,6 @@ def test_config_args(parser_with_config, cli_config_file): ['serve', 'mymodel', '--config', cli_config_file]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code - assert not args.multi_step_stream_outputs def test_config_file(parser_with_config): @@ -828,7 +827,6 @@ def test_model_specification(parser_with_config, cli_config_file, ]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code is True - assert args.multi_step_stream_outputs is False assert args.port == 12312 diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index a756c89b52..1f16e92f65 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -58,12 +58,6 @@ def 
test_unsupported_configs(monkeypatch): disable_async_output_proc=True, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - num_scheduler_steps=5, - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index ec33d334ab..2031f41fab 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -11,7 +11,6 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.model_executor import SamplingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -from vllm.worker.multi_step_model_runner import StatefulModelInput from vllm.worker.pooling_model_runner import ( ModelInputForGPUWithPoolingMetadata) @@ -166,81 +165,3 @@ def test_embedding_model_runner_input(): None) == getattr(attn_metadata, field.name, None) # Pooling metadata is not broadcast. assert received_model_input.pooling_metadata is None - - -def test_multi_step_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - frozen_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - is_last_step=True, - is_first_multi_step=False, - current_step=4, - last_sampled_token_ids=torch.ones((10, 1)), - is_multi_step=True, - num_queries=8, - num_seqs=5, - cached_outputs=[], - ) - - assert isinstance(model_input, StatefulModelInput) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - - received_frozen_input = received_model_input.frozen_model_input - - # Check that received copy has correct values. - assert isinstance(received_model_input, StatefulModelInput) - assert received_frozen_input.input_tokens is not None - assert (received_frozen_input.input_tokens == - frozen_model_input.input_tokens).all() - assert received_frozen_input.input_positions is not None - assert (received_frozen_input.input_positions == - frozen_model_input.input_positions).all() - assert received_frozen_input.multi_modal_kwargs is None - assert (frozen_model_input.multi_modal_kwargs == - frozen_model_input.multi_modal_kwargs) - assert received_frozen_input.lora_requests is None - assert (received_frozen_input.lora_requests == - frozen_model_input.lora_requests) - assert received_frozen_input.lora_mapping is None - assert ( - received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_frozen_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. 
- assert (received_frozen_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_frozen_input.sampling_metadata.seq_groups is None - - # check non frozen fields - assert received_model_input.is_last_step == model_input.is_last_step - assert (received_model_input.is_first_multi_step == - model_input.is_first_multi_step) - assert received_model_input.current_step == model_input.current_step - assert (received_model_input.last_sampled_token_ids == - model_input.last_sampled_token_ids).all() - assert received_model_input.is_multi_step == model_input.is_multi_step diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df4eb33f5d..6649cd89ee 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3779,8 +3779,6 @@ class VllmConfig: f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " - f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, " - f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, " # noqa f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 61346da145..63894e7f5d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -929,8 +929,7 @@ class Scheduler: ) def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): + if self.scheduler_config.chunked_prefill_enabled: prompt_limit = self.scheduler_config.max_model_len else: prompt_limit = min( @@ -1114,9 +1113,6 @@ class Scheduler: continue num_lookahead_slots: int = 0 - if self.scheduler_config.is_multi_step and enable_chunking: - num_lookahead_slots = self._get_num_lookahead_slots( - True, enable_chunking) # If the sequence group cannot be allocated, stop. can_allocate = self.block_manager.can_allocate( @@ -1195,24 +1191,6 @@ class Scheduler: partial_prefill_metadata.maybe_increment_partial_prefills( seq_group) - if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: List[Tuple[int, int]] = [] - # init_multi_step_from_lookahead_slots happens in append_slots - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - # This assert will trip when a copy-on-write happens. This is - # not a concern as the very first sequence-group block - # allocation happens above. Still, we have the assert to - # catch any edge-cases. - assert not blocks_to_copy - else: - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config. 
- num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -1453,14 +1431,6 @@ class Scheduler: num_prefill_groups = (len(prefills.seq_groups) + len(swapped_in.prefill_seq_groups) + len(running_scheduled.prefill_seq_groups)) - # If all prompts, then we set num_lookahead_slots to 0 - # this allows us to go through the `no_spec` path in - # `spec_decode_worker.py` - all_prefills = len(scheduled_seq_groups) == num_prefill_groups - num_lookahead_slots = (0 if - (all_prefills - and not self.scheduler_config.is_multi_step) - else running_scheduled.num_lookahead_slots) return SchedulerOutputs( scheduled_seq_groups=scheduled_seq_groups, num_prefill_groups=num_prefill_groups, @@ -1472,7 +1442,7 @@ class Scheduler: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=num_lookahead_slots, + num_lookahead_slots=0, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), @@ -1516,11 +1486,6 @@ class Scheduler: num_lookahead_slots = self._get_num_lookahead_slots( is_prefill, enable_chunking) - if is_prefill and num_lookahead_slots > 0: - # Appending prefill slots only happens multi-step and - # chunked-prefill are enabled together. - assert self.scheduler_config.is_multi_step and enable_chunking - return self.block_manager.can_append_slots( seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) @@ -1776,19 +1741,7 @@ class Scheduler: num_lookahead_slots: int = self._get_num_lookahead_slots( is_prefill, enable_chunking) - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config.num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - if self.scheduler_config.is_multi_step and enable_chunking: - # In multi-step chunked-prefill any sequence type can have - # slots appended. - seq_status = None - for seq in seq_group.get_seqs(status=seq_status): cows = self.block_manager.append_slots(seq, num_lookahead_slots) if len(cows) > 0: @@ -1904,29 +1857,8 @@ class Scheduler: """The number of slots to allocate per sequence per step, beyond known token ids. Speculative decoding uses these slots to store KV activations of tokens which may or may not be accepted. - - Speculative decoding does not yet support prefill, so we do not perform - lookahead allocation for prefill. - - When chunking is enabled with multi-step, we allocate lookahead slots - for the prefills for when the prefills turn into decodes in the first - step. """ - if is_prefill: - if self.scheduler_config.is_multi_step and enable_chunking: - # num_lookahead_slots was introduced in the context of decodes, - # in Speculative Decoding. - # When the num_scheduler_steps is 8, say, then the - # num_lookahead_slots is 7. Meaning, we are doing a 1-step of - # decode anyways and we wish to do 7 more. - # - # "lookaheads" for prefills, is introduced in support for - # Chunked-Prefill in Multi-Step. - return self.scheduler_config.num_lookahead_slots + 1 - else: - return 0 - - return self.scheduler_config.num_lookahead_slots + return 0 def _get_num_new_uncached_and_cached_tokens( self, @@ -2068,24 +2000,6 @@ class Scheduler: The number of new tokens to schedule after chunking. 
""" remaining_token_budget = budget.remaining_token_budget() - if scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > prompt_limit: - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - return num_new_tokens - - return 0 if num_new_tokens > \ - remaining_token_budget else num_new_tokens # Get the number of tokens to allocate to this prefill slot prefill_slot_budget = ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d74db67bda..c058001ceb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -362,8 +362,6 @@ class EngineArgs: lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size - num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps - multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: Optional[ int] = CacheConfig.num_gpu_blocks_override @@ -799,11 +797,8 @@ class EngineArgs: **scheduler_kwargs["delay_factor"]) scheduler_group.add_argument("--preemption-mode", **scheduler_kwargs["preemption_mode"]) - scheduler_group.add_argument("--num-scheduler-steps", - **scheduler_kwargs["num_scheduler_steps"]) - scheduler_group.add_argument( - "--multi-step-stream-outputs", - **scheduler_kwargs["multi_step_stream_outputs"]) + # multi-step scheduling has been removed; corresponding arguments + # are no longer supported. 
scheduler_group.add_argument("--scheduling-policy", **scheduler_kwargs["policy"]) scheduler_group.add_argument( @@ -1257,28 +1252,11 @@ class EngineArgs: disable_log_stats=self.disable_log_stats, ) - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if self.num_scheduler_steps > 1: - if speculative_config is not None: - raise ValueError("Speculative decoding is not supported with " - "multi-step (--num-scheduler-steps > 1)") - if self.enable_chunked_prefill and self.pipeline_parallel_size > 1: - raise ValueError("Multi-Step Chunked-Prefill is not supported " - "for pipeline-parallel-size > 1") - if current_platform.is_cpu(): - logger.warning("Multi-Step (--num-scheduler-steps > 1) is " - "currently not supported for CPUs and has been " - "disabled.") - self.num_scheduler_steps = 1 - - # make sure num_lookahead_slots is set the higher value depending on - # if we are using speculative decoding or multi-step - num_lookahead_slots = max(self.num_lookahead_slots, - self.num_scheduler_steps - 1) - num_lookahead_slots = num_lookahead_slots \ - if speculative_config is None \ - else speculative_config.num_lookahead_slots + # make sure num_lookahead_slots is set appropriately depending on + # whether speculative decoding is enabled + num_lookahead_slots = self.num_lookahead_slots + if speculative_config is not None: + num_lookahead_slots = speculative_config.num_lookahead_slots scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, @@ -1292,8 +1270,6 @@ class EngineArgs: disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, preemption_mode=self.preemption_mode, - num_scheduler_steps=self.num_scheduler_steps, - multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, @@ -1392,11 +1368,6 @@ class EngineArgs: recommend_to_remove=True) return False - if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps: - _raise_or_fallback(feature_name="--num-scheduler-steps", - recommend_to_remove=True) - return False - if self.scheduler_delay_factor != SchedulerConfig.delay_factor: _raise_or_fallback(feature_name="--scheduler-delay-factor", recommend_to_remove=True) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008e..b6ee410534 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -15,7 +15,7 @@ from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState +from vllm.engine.llm_engine import LLMEngine from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.executor.executor_base import ExecutorBase @@ -308,13 +308,6 @@ class _AsyncLLMEngine(LLMEngine): if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -351,29 +344,14 @@ 
class _AsyncLLMEngine(LLMEngine): outputs = await self.model_executor.execute_model_async( execute_model_req) - # we need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # Clear the cache if we have finished all the steps - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[ - virtual_engine] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fc4f6445d..bbe958351e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -25,7 +25,6 @@ from vllm.engine.metrics_types import StatLoggerBase, Stats from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase @@ -91,7 +90,7 @@ class OutputData(NamedTuple): class SchedulerContext: - def __init__(self, multi_step_stream_outputs: bool = False): + def __init__(self) -> None: self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] @@ -99,8 +98,6 @@ class SchedulerContext: List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None - self.multi_step_stream_outputs: bool = multi_step_stream_outputs - def append_output(self, outputs: List[SamplerOutput], seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, @@ -303,8 +300,7 @@ class LLMEngine: ] self.scheduler_contexts = [ - SchedulerContext(multi_step_stream_outputs=self.scheduler_config. - multi_step_stream_outputs) + SchedulerContext() for _ in range(self.parallel_config.pipeline_parallel_size) ] @@ -683,8 +679,7 @@ class LLMEngine: "Priority scheduling is not enabled.") if isinstance(params, SamplingParams) \ - and params.logits_processors \ - and self.scheduler_config.num_scheduler_steps > 1: + and params.logits_processors: raise ValueError( "Logits processors are not supported in multi-step decoding") @@ -868,45 +863,6 @@ class LLMEngine: return - def _update_num_computed_tokens_for_multi_step_prefill( - self, seq_group: SequenceGroup, - seq_group_meta: SequenceGroupMetadata, - is_first_step_output: Optional[bool]): - """ - This function updates num_computed_tokens for prompt sequences - when Multi-Step is enabled. - - seq_group: SequenceGroup to update the num_computed_tokens for. - seq_group_meta: Metadata of the given SequenceGroup. 
- is_first_step_output: Optional[bool] - - When available, is_first_step_output indicates if the appended - output token is the output of the first-step in multi-step. - A value of None indicates that outputs from all steps in - in multi-step are submitted in a single burst. - """ - - assert self.scheduler_config.is_multi_step - - if not seq_group_meta.is_prompt: - # num_computed_token updates for multi-step decodes happen after - # the tokens are appended to the sequence. - return - - do_update: bool = False - if self.scheduler_config.chunked_prefill_enabled: - # In multi-step + chunked-prefill case, the prompt sequences - # that are scheduled are fully processed in the first step. - do_update = is_first_step_output is None or is_first_step_output - else: - # Normal multi-step decoding case. In this case prompt-sequences - # are actually single-stepped. Always update in this case. - assert seq_group.state.num_steps == 1 - do_update = True - - if do_update: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size) - def _process_model_outputs(self, ctx: SchedulerContext, request_id: Optional[str] = None) -> None: @@ -939,33 +895,8 @@ class LLMEngine: has_multiple_outputs: bool = len(outputs) > 1 outputs_by_sequence_group: List[List[SequenceGroupOutput]] - if has_multiple_outputs: - assert self.scheduler_config.is_multi_step or \ - self.speculative_config - # Organize outputs by [step][sequence group] instead of - # [sequence group][step]. - if self.scheduler_config.is_multi_step: - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, len(seq_group_metadata_list)) - elif self.speculative_config: - # Decodes are multi-steps while prefills are not, outputting at - # most 1 token. Separate them so that we can trigger chunk - # processing without having to pad or copy over prompts K times - # to match decodes structure (costly with prompt_logprobs). - num_prefills = sum(sg.is_prompt - for sg in seq_group_metadata_list) - prefills, decodes = outputs[:num_prefills], outputs[ - num_prefills:] - outputs_by_sequence_group = create_output_by_sequence_group( - decodes, - num_seq_groups=len(seq_group_metadata_list) - num_prefills) - outputs_by_sequence_group = [p.outputs for p in prefills - ] + outputs_by_sequence_group - # We have outputs for multiple steps submitted in a single burst, - # so invalidate is_first_step_output. 
- is_first_step_output = None - else: - outputs_by_sequence_group = outputs + assert not has_multiple_outputs + outputs_by_sequence_group = outputs # Determine the requests we need to operate on if request_id: @@ -1006,13 +937,8 @@ class LLMEngine: output = [outputs_by_sequence_group[0][i]] if not is_async: - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_meta, is_first_step_output) - else: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) + seq_group.update_num_computed_tokens( + seq_group_meta.token_chunk_size or 0) if outputs: for o in outputs: @@ -1074,15 +1000,6 @@ class LLMEngine: for scheduler in self.scheduler: scheduler.free_finished_seq_groups() - # For multi-step without streaming, don't create outputs each iteration - if not is_last_step and not ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if (finished_now - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - # Create the outputs for i in indices: if i in skip or i in finished_before or i in finished_now: @@ -1101,13 +1018,7 @@ class LLMEngine: if request_output: ctx.request_outputs.append(request_output) - # For multi-step with streaming, create outputs each iteration - if not is_last_step and ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if self.process_request_outputs_callback is not None: - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return + # Create outputs only after processing the scheduler's results for seq_group in scheduler_outputs.ignored_seq_groups: params = seq_group.sampling_params @@ -1157,16 +1068,10 @@ class LLMEngine: if seq_group.is_finished(): continue - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_metadata, - seq_group.state.num_steps == 1) - else: - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) + token_chunk_size = (seq_group_metadata.token_chunk_size + if seq_group_metadata.token_chunk_size + is not None else 0) + seq_group.update_num_computed_tokens(token_chunk_size) if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( @@ -1177,16 +1082,8 @@ class LLMEngine: assert len(seq_group.seqs) == 1 seq = seq_group.seqs[0] - if self.scheduler_config.is_multi_step: - is_prefill_append = seq.data.get_num_uncomputed_tokens( - ) == 0 - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if not is_prefill_append: - seq_group.update_num_computed_tokens(1) - else: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. 
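The hunks above collapse the multi-step output path into the plain single-step one: each engine iteration now produces exactly one output burst, computed-token accounting is driven directly by the scheduled token_chunk_size, and every sampled token is appended immediately instead of being cached across lookahead steps. Below is a minimal, self-contained sketch of that simplified accounting, using toy stand-in classes (the Toy* names and process_single_step_output are hypothetical; only update_num_computed_tokens and append_token_id mirror calls visible in the diff), not the real vLLM SequenceGroup/Sequence types:

# Illustrative sketch only: toy stand-ins for the engine bookkeeping that
# remains once the multi-step branches are removed. Real vLLM types carry
# much more state (logprobs, embeds, status, schedulers, etc.).
from dataclasses import dataclass, field
from typing import List


@dataclass
class ToySequence:
    token_ids: List[int] = field(default_factory=list)

    def append_token_id(self, token_id: int) -> None:
        # Tokens are appended as soon as they are sampled; there is no
        # deferred per-step append as in the removed multi-step runner.
        self.token_ids.append(token_id)


@dataclass
class ToySequenceGroup:
    seq: ToySequence
    num_computed_tokens: int = 0

    def update_num_computed_tokens(self, n: int) -> None:
        self.num_computed_tokens += n


def process_single_step_output(seq_group: ToySequenceGroup,
                               token_chunk_size: int,
                               sampled_token_id: int) -> None:
    # With multi-step gone there is exactly one output per iteration, so the
    # chunk that was just executed is credited once and the sampled token is
    # appended right away (no cached scheduler outputs, no streaming branch).
    seq_group.update_num_computed_tokens(token_chunk_size or 0)
    seq_group.seq.append_token_id(sampled_token_id)


# Usage: one scheduled chunk of 16 tokens followed by one sampled token.
group = ToySequenceGroup(seq=ToySequence())
process_single_step_output(group, token_chunk_size=16, sampled_token_id=42)
assert group.num_computed_tokens == 16 and group.seq.token_ids == [42]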
@@ -1289,13 +1186,6 @@ class LLMEngine: if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -1345,10 +1235,6 @@ class LLMEngine: # Raise so the caller is notified that this request failed raise - # We need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: # Nothing scheduled => If there is pending async postprocessor, # then finish it here. @@ -1357,19 +1243,9 @@ class LLMEngine: # No outputs in this case outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # clear the cache if we have finished all the steps. - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[0] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 @@ -1453,22 +1329,7 @@ class LLMEngine: def _has_remaining_steps( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] ) -> bool: - if (not self.scheduler_config.is_multi_step - or not seq_group_metadata_list): - return False - - # TODO(will) this is a sanity check for nowto make sure that all the - # seqs are on the same steps. Eventually we will want to do some sort of - # dynamic scheduling when doing multi-step decoding. - ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps - if any([ - seq_group.state.remaining_steps != ref_remaining_steps - for seq_group in seq_group_metadata_list[1:] - ]): - raise AssertionError("All running sequence groups should " - "have the same remaining steps.") - - return ref_remaining_steps > 0 + return False def _cache_scheduler_outputs_for_multi_step( self, virtual_engine: int, @@ -1497,13 +1358,6 @@ class LLMEngine: def _get_last_sampled_token_ids( self, virtual_engine: int) -> Optional[torch.Tensor]: - cached_last_output = self.cached_scheduler_outputs[ - virtual_engine].last_output - if (self.scheduler_config.is_multi_step - and self.parallel_config.pipeline_parallel_size > 1 - and cached_last_output is not None - and cached_last_output.sampled_token_ids_cpu is not None): - return cached_last_output.sampled_token_ids_cpu return None def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 19c5963d32..4d75719c17 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -36,27 +36,13 @@ class SequenceGroupOutputProcessor(ABC): ): """Create an output processor. - This returns a single-step output processor if num_lookahead_slots is - zero, else returns a multi-step output processor. 
+ Multi-step scheduling is no longer supported. Always return a + single-step output processor. """ - if scheduler_config.num_lookahead_slots == 0: - # Importing here to avoid cycle. - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, - stop_checker) - else: - # Importing here to avoid cycle. - from vllm.engine.output_processor.multi_step import ( - MultiStepOutputProcessor) - return MultiStepOutputProcessor( - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, - ) + from vllm.engine.output_processor.single_step import ( + SingleStepOutputProcessor) + return SingleStepOutputProcessor(scheduler_config, detokenizer, + scheduler, seq_counter, stop_checker) @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py deleted file mode 100644 index 8b66ef0dc7..0000000000 --- a/vllm/engine/output_processor/multi_step.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import functools -from typing import Callable, List, cast - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.single_step import ( - single_step_process_prompt_logprob) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -class MultiStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. It specializes to "multi-step - decoding", where vLLM's worker may generate multiple tokens per invocation. - This is currently mutually exclusive with advanced sampling techniques like - beam search, which motivates the separation of this logic from the single - step output processor. - - This class is responsible for things such as correctly appending all new - token ids to their sequence, detokenizing new token ids, truncating new - output tokens after an eos token, and correctly handling the case where the - number of new output tokens per sequence differs in a single batch. - """ - - def __init__( - self, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], - stop_checker: StopChecker, - ): - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq - self.stop_checker = stop_checker - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with each step of a multi-step- - scheduled computation. 
- - Args: - seq_group: the outputs are associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s - for all scheduler steps - """ - for output in outputs: - # Concatenate single-step prompt logprob processing results. - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - @staticmethod - @functools.lru_cache - def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - logger.warning( - "Prompt logprob is not supported by multi step workers. " - "(e.g., speculative decode uses multi step workers).") - - def process_outputs(self, - sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool = False) -> None: - """Append new tokens in the outputs to sequences in the sequence group. - - This only supports sequence groups of size 1. It supports greater than - one new token per sequence. - - This applies logic like stop condition checking and detokenization. - It also handles cases where there are tokens emitted after - the EOS token. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - # Sequences can be in RUNNING or FINISHED_ABORTED state - # once scheduled, as a sequence is moved to FINISHED_ABORTED - # if a client disconnects from the api server. - seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - if seqs is None: - seqs = sequence_group.get_seqs( - status=SequenceStatus.FINISHED_ABORTED) - - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" - assert len(seqs) == 1, ( - "Beam search not supported in multi-step decoding.") - seq = seqs[0] - seq_id = seq.seq_id - # This method is defined in the more generic - # SequenceGroupOutputProcessor, but here we assume that the outputs are - # of a more specific type. - assert all([ - isinstance(output, CompletionSequenceGroupOutput) - for output in outputs - ]) - compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) - assert all([ - seq_id == output.samples[0].parent_seq_id - for output in compl_outputs - ]) - - if is_async: - # Async case: We process tokens one by one. Here, we know the token - # was already appended, so we only need to do the rest of the - # postprocessor: Detokenization + stopping logic - self._process_decode_and_stop(seq, sequence_group.sampling_params) - else: - # Standard multi-step case - - # Since there's only one sequence per sequence group, - # we can take the first sample. - samples = [output.samples[0] for output in compl_outputs] - - # entries in sample tokens may be invalid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples - if sample.output_token != VLLM_INVALID_TOKEN_ID - ] - - # When both spec-decode and pre-fill chunking are enabled, we - # don't have guaranteed samples here (e.g. all -1s). 
- if valid_samples: - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) - - def _process_decode_and_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - new_char_count = 0 - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - - # TODO(sang): Support lora. - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - def _process_seq_outputs(self, seq: Sequence, - valid_samples: List[SequenceOutput], - sampling_params: SamplingParams) -> None: - output_token_ids = [sample.output_token for sample in valid_samples] - output_logprobs = [sample.logprobs for sample in valid_samples] - output_embeds = [sample.output_embed for sample in valid_samples] - - # Truncate to max_tokens if necessary. - remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + - len(output_token_ids)) - if remaining_tokens < 0: - output_token_ids = output_token_ids[:remaining_tokens] - - # Truncate any tokens after EOS. This is required as spec decode - # generates a fixed number of tokens without evaluating stopping - # conditions within the block. This can cause an eos token to be - # unintentionally ignored. - if not sampling_params.ignore_eos and self.detokenizer: - eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id - # Avoiding .index calls as exception throwing in the happy path - # is expensive. - for i in range(len(output_token_ids)): - if output_token_ids[i] == eos_token_id: - output_token_ids = output_token_ids[:i + 1] - break - - is_prefill_sampled_token = seq.data.get_num_uncomputed_tokens() == 0 - # Incrementally append tokens to the sequence, as if we had only one new - # token. - for output_token_id, output_logprob, output_embed in zip( - output_token_ids, output_logprobs, output_embeds): - seq.append_token_id( - token_id=output_token_id, - logprobs=output_logprob, - token_embed=output_embed, - ) - - if is_prefill_sampled_token: - is_prefill_sampled_token = False - else: - # Update num_computed_tokens iff the sampled token is not from - # a prefill step. - seq.data.update_num_computed_tokens(1) - - self._process_decode_and_stop(seq, sampling_params) - - if seq.is_finished(): - break diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c876c52a2e..7095913157 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -118,20 +118,10 @@ class CudaPlatformBase(Platform): @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config model_config = vllm_config.model_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. 
Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -139,7 +129,7 @@ class CudaPlatformBase(Platform): else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 8005830f55..2d5bee5fc5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -327,18 +327,8 @@ class RocmPlatform(Platform): cache_config.block_size = 16 parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -346,7 +336,7 @@ class RocmPlatform(Platform): else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c56096d936..c7522a89c2 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -133,18 +133,13 @@ class TpuPlatform(Platform): parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker" assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") if scheduler_config.is_multimodal_model and not \ - scheduler_config.disable_chunked_mm_input: + scheduler_config.disable_chunked_mm_input: logger.warning("TPU does not support running Multimodal models"\ " without setting `--disable_chunked_mm_input`. " \ "Forcing --disable_chunked_mm_input.") diff --git a/vllm/sequence.py b/vllm/sequence.py index 6e65a2bd03..cbe63f8d1d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -794,35 +794,6 @@ class SequenceGroup: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def init_multi_step(self, num_steps: int) -> None: - self.state.num_steps = num_steps - self.state.current_step = 0 - - def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, - num_scheduler_steps: int, - is_multi_step: bool, - enable_chunking: bool) -> None: - - if not is_multi_step: - self.init_multi_step(num_steps=num_scheduler_steps) - return - - # Multi-Step case - is_prefill = self.is_prefill() - - # The asserts below reflect the expectations of the current system. 
- if is_prefill and enable_chunking: - assert num_lookahead_slots == num_scheduler_steps - self.init_multi_step(num_steps=num_lookahead_slots) - else: - is_decode: bool = not is_prefill - # If it is a prefill, num_lookahead_slots must be 0 - assert num_lookahead_slots == 0 or is_decode - # If it is a decode, num_lookahead_slots + 1 must match - # the scheduler steps. - assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill - self.init_multi_step(num_steps=num_lookahead_slots + 1) - def set_last_token_time(self, now: float) -> None: """Sets the last token time for Request level timings.""" # If still in prefill phase, assertion fails. @@ -1367,15 +1338,6 @@ class ExecuteModelRequest( # Async callback async_callback: Optional[Callable] = None - @property - def is_first_multi_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.current_step == 0 - @property def is_last_step(self) -> bool: # TODO(will) make this be able to handle batches with variable number of diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 20b9b733cd..a63797e3a4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -508,8 +508,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): if inter_data.is_prompt: context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder: + elif self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -778,9 +777,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): int: Returns the determined number of padding sequences. If CUDA graphs is not viable, returns -1. """ - is_mscp: bool = self.runner.scheduler_config.is_multi_step and \ - self.runner.scheduler_config.chunked_prefill_enabled - decode_only = self.decode_only or is_mscp + decode_only = self.decode_only if not decode_only: # Early exit so we can treat num_seqs as the batch_size below. 
return -1 diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py deleted file mode 100644 index 2aa910bdff..0000000000 --- a/vllm/worker/multi_step_model_runner.py +++ /dev/null @@ -1,908 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import functools -from dataclasses import dataclass, field -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch - -from vllm.distributed import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - BroadcastableModelInput, _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from ..model_executor.model_loader.tensorizer import TensorizerConfig - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -MULTI_STEP_ATTENTION_BACKENDS = [ - "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" -] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] - -def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> List[str]: - if chunked_prefill_enabled: - return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS - else: - return MULTI_STEP_ATTENTION_BACKENDS - - -def seq_output_builder(): - return SequenceOutput( - 0, 0, - {0: Logprob(logprob=float('inf'), rank=None, decoded_token=None)}) - - -def completion_seq_group_output_builder(): - return CompletionSequenceGroupOutput([], None) - - -# Used by pythonization to reduce python object allocations -class PythonizationCache: - - def __init__(self): - self.cached_seq_output = PyObjectCache(seq_output_builder) - self.cached_completion_seq_group_output = PyObjectCache( - completion_seq_group_output_builder) - - def reset(self): - self.cached_seq_output.reset() - self.cached_completion_seq_group_output.reset() - - -@dataclass -class ModelOutput: - """The output of a single model forward pass. - - The sampler_output_ready_event is set when the tensors in - sampler_output are ready (the model+sampler forward pass has - completed). We use the event to synchronize the GPU->CPU transfer, - which we want to only run when the data has been written to the - GPU tensors. Until the event is ready, the tensors in sampler_output - will have garbage data. - - There are two scenarios: - 1. The output tensors are ready and we can pythonize them immediately. - 2. The output tensors are not ready and we need to wait for the event to be - ready. - """ - sampler_output: SamplerOutput - sampler_output_ready_event: torch.cuda.Event - sampled_token_ids: Optional[torch.Tensor] = None - pythonized: bool = False - # On-device tensor containing the logprobs of each token. 
- logprobs: Optional["torch.Tensor"] = None - pythonization_cache: Optional[PythonizationCache] = None - - def pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output. Blocking.""" - if not self.pythonized: - self._pythonize_sampler_output(input_metadata, copy_stream, - pinned_sampled_token_buffer, True) - self.pythonized = True - - def maybe_pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output if ready, else return None. Non-blocking.""" - if not self.pythonized: - self.pythonized = self._pythonize_sampler_output( - input_metadata, copy_stream, pinned_sampled_token_buffer, - False) - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.cuda.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -@dataclass(frozen=False) -class StatefulModelInput(BroadcastableModelInput): - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - - # list of model outputs for each step, may not be all pythonized - cached_outputs: List[ModelOutput] = field(default_factory=list) - - # used to pass sampled token ids from the last step to the current step for - # TP workers. 
Used to append to end of outputs and used by advance_step - last_sampled_token_ids: Optional[torch.Tensor] = None - current_step: int = 0 - is_multi_step: bool = True - is_last_step: bool = False - is_first_multi_step: bool = False - base_output_proc_callback: Optional[Callable] = None - # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[current_platform.Event] = field( - default_factory=lambda: [current_platform.Event(blocking=True)] * 2) - num_seqs: int = -1 - num_queries: int = -1 - num_single_step_prefills: int = 0 - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - assert self.frozen_model_input is not None - tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() - new_tensor_dict = { - 'last_sampled_token_ids': self.last_sampled_token_ids, - 'current_step': self.current_step, - 'is_multi_step': self.is_multi_step, - 'is_last_step': self.is_last_step, - 'is_first_multi_step': self.is_first_multi_step, - 'num_seqs': self.num_seqs, - 'num_queries': self.num_queries, - 'num_single_step_prefills': self.num_single_step_prefills, - } - tensor_dict.update(new_tensor_dict) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInput": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForGPUWithSamplingMetadata, tensor_dict) - - return cls(**tensor_dict) - - def record_step_event(self, current_stream: torch.cuda.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and GPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.cuda.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - def wait_previous_step(self): - # These cuda events are an explicit synchronization to ensure that - # advance_step() (for other attn backends that may be supported in the - # future) do not clobber any data structures that is also used by any - # enqueued forwards steps. For distributed case, only a single event is - # needed, but for single GPU case, since we can let the CPU run much - # further ahead, two events allow us to overlap the advance_step with - # the previous forward (ie using two DecodeWrappers for flashinfer - # backend) - self.step_cuda_events[(self.current_step + 1) & 1].wait() - - def add_sampler_output(self, - sampler_output: SamplerOutput, - sampled_token_ids: Optional[torch.Tensor] = None): - self.cached_outputs.append( - ModelOutput(sampler_output=sampler_output, - sampler_output_ready_event=None, - sampled_token_ids=sampled_token_ids, - pythonized=False)) - - def maybe_advance_sampling_metadata(self, device: str, pin_memory: bool): - """ - sampling_metadata.selected_token_indices is constructed for the - first-step in Multi-Step. However, when chunked-prefill is enabled with - multi-step, the scheduled prompts are fully processed in the - first-step and are processed as decodes in the rest of the steps. - This function updates the sampling_metadata.selected_token_indices - to account for this conversion. 
- - Example: - Let 2 prompts and 2 decodes be scheduled together. Let the - num-tokens to process for the 2 prompts be 5 and 8 respectively. - - In that case, sampling_metadata.sampled_token_indices will be, - [4, 12, 13, 14] as it is constructed for the first-step in - multi-step. - However, the prompts turns to decodes after the first-step - and the num-tokens for the previously-prompt sequences will - be 1 and 1 as they are decodes now. The self.sampled_token_indices - must be updated to [0,1,2,3]. - """ - assert self.current_step == 1 and self.num_single_step_prefills > 0 - if not get_pp_group().is_last_rank: - return - - assert self.frozen_model_input is not None - assert self.frozen_model_input.sampling_metadata is not None - self.frozen_model_input.sampling_metadata.selected_token_indices = \ - async_tensor_h2d(list(range(self.num_queries)), - dtype=torch.long, - target_device=device, - pin_memory=pin_memory) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positions. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - assert fmi.lora_mapping is None - assert fmi.lora_requests is not None - assert len(fmi.lora_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -# MutableModelInputForGPUWithMultiStepMetadata is not subclass of -# ModelInputForGPU but it wraps the actual input dataclass and adds multi-step -# metadata -# mypy: disable-error-code=type-var -class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): - - super().__init__(*args, **kwargs) - - # Check attention backend support. - supported_attention_backends: List[str] = \ - _get_supported_attention_backends( - self.scheduler_config.chunked_prefill_enabled) - if self.attn_backend.get_name() not in supported_attention_backends: - ms_config_str: str = "Multi-Step + Chunked-Prefill" \ - if self.scheduler_config.chunked_prefill_enabled \ - else "Multi-Step" - raise ValueError( - f"{ms_config_str} not supported for attention backend: " - f"{self.attn_backend.get_name()}. 
Set VLLM_ATTENTION_BACKEND " - f"to a value from {supported_attention_backends}.") - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: GPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from GPU to CPU asynchronously - return torch.cuda.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: - model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInput: - frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInput, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with GPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for GPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, None, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
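(A minimal, self-contained sketch of the callback/pythonize chaining that _async_process_outputs and _final_process_outputs above rely on; the names below are hypothetical stand-ins for illustration, not vLLM APIs.)

    from dataclasses import dataclass

    @dataclass
    class _Step:
        ready: bool                  # stands in for "this step's CUDA event has fired"
        pythonized: bool = False

    def drain(steps, callback):
        # Alternate engine callbacks with pythonization so CPU post-processing of
        # finished steps overlaps with GPU execution of later steps.
        callback()
        for step in steps:
            if not step.ready:
                break                # CPU caught up with the GPU; resume on the next drain
            step.pythonized = True   # convert GPU-side results into Python objects
            callback()               # hand this step to the engine right away

    processed = []
    drain([_Step(True), _Step(True), _Step(False)],
          lambda: processed.append(len(processed)))
    # processed == [0, 1, 2]: the initial flush plus one hand-off per finished step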
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of GPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any GPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - None, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.cuda.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - ModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These GPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
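(The "pythonize if ready" check above amounts to an event-gated copy from the GPU into the pinned buffer allocated earlier; a minimal sketch with toy shapes, which assumes a CUDA device and is not the runner's real buffers.)

    import torch

    sampled = torch.randint(0, 50_000, (8, 1), device="cuda")   # GPU-side sampled token ids
    ready = torch.cuda.Event()
    ready.record()                     # recorded right after the sampling kernel is enqueued

    pinned = torch.empty((8, 1), dtype=sampled.dtype, device="cpu", pin_memory=True)
    if ready.query():                  # CPU is ahead of the GPU, so results are already there
        pinned.copy_(sampled, non_blocking=False)
        token_ids = pinned.tolist()    # cheap: the data already lives in pinned host memory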
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInput, - out: SamplerOutput) -> StatefulModelInput: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize GPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the GPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInput, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) - for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - GPU-side token buffer.) - sampled_token_ids: GPU-side token buffer - logprobs_tensor: GPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. 
- # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. - - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU GPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU GPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - 
seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py deleted file mode 100644 index 25f588077c..0000000000 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from importlib.util import find_spec -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - - -class MultiStepNeuronModelRunner(NeuronModelRunner): - """A model runner for multi step decoding using the transformers_neuronx - framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.speculation_config = self.speculative_config - from transformers_neuronx.config import GenerationConfig - self.speculation_config.draft_model_config.neuron_sampling_params = ( - GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K - )) - - def load_model(self) -> None: - if find_spec("transformers_neuronx") is not None: - from vllm.model_executor.model_loader.neuron import ( - get_neuron_eagle_speculation_model, - get_neuron_speculation_model) - if self.speculation_config.speculative_token_tree is not None: - self.model = get_neuron_eagle_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - raise NotImplementedError( - "Supports only Transformer-NeuronX based models.") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, 
- sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py deleted file mode 100644 index dd521dd67d..0000000000 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - - -class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): - """A model runner for multi-step decoding using the - neuronx-distributed-inference framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - - def load_model(self) -> None: - from vllm.model_executor.model_loader.neuronx_distributed import ( - get_neuron_speculation_model) - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculative_config) - - @torch.inference_mode() - def execute_model( - self, - model_input, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - sampling_params = torch.tensor([[ - seq_group.sampling_params.top_k, - seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature, - ] for seq_group in model_input.sampling_metadata.seq_groups]) - - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py deleted file mode 100644 index ea16e14f9e..0000000000 --- a/vllm/worker/multi_step_worker.py +++ /dev/null @@ -1,197 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import (MultiStepModelRunner, - StatefulModelInput) -from vllm.worker.worker import Worker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(Worker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelRunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - 
kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. - frozen_model_input.attn_metadata._cached_prefill_metadata = None - frozen_model_input.attn_metadata._cached_decode_metadata = None - - model_input.is_first_multi_step = is_first_multi_step - model_input.is_last_step = execute_model_req.is_last_step - - if not is_first_multi_step: - # we broadcast the last sampled token ids to all TP workers so they - # can update their model input metadata in-place. - self._prepare_last_sampled_token_ids_for_tp_workers( - execute_model_req=execute_model_req, model_input=model_input) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def _prepare_last_sampled_token_ids_for_tp_workers( - self, - execute_model_req: ExecuteModelRequest, - model_input: StatefulModelInput, - ) -> None: - """ - Prepare the last sampled token ids for TP workers. If it's the last - PP rank, then the last sampled token ids are already in the model_input. - If it is NOT the last PP rank, then we need to get the last sampled - token that is cached in the execute_model_req. - """ - if get_pp_group().is_last_rank: - assert model_input.cached_outputs[ - -1].sampler_output.sampled_token_ids is None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - model_input.last_sampled_token_ids = model_input.cached_outputs[ - -1].sampled_token_ids - # free sampled token ids from the previous step if it has been - # pythonized. Cannot free the last sampled token ids because - # we need it for GPU advance_step. 
- for output in model_input.cached_outputs[:-1]: - if output.pythonized: - output.sampled_token_ids = None - else: - # otherwise we need to get the cached sampled token ids from the - # execute_model_req - assert execute_model_req.last_sampled_token_ids is not None - model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - # free sampled token ids from the previous step. - # TODO(will) we could reuse the sampled token ids tensor from - # the previous step instead. - for output in model_input.cached_outputs[:-1]: - output.sampled_token_ids = None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, - torch.Tensor]]]: - """ - Depending on the current state of the request and multi step worker, - this method may skip the normal _prepare_model_input and - _prepare_worker_input methods and instead used cached values. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. 
Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 4e1408300f..3e4512a639 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -64,25 +64,21 @@ class NeuronWorker(LocalOrDistributedWorkerBase): assert (self.lora_config is None), ("LoRA is not supported for TransformersNeuronX " "framework.") - from vllm.worker.multi_step_neuron_model_runner import ( - MultiStepNeuronModelRunner) if self.speculative_config is not None: - return MultiStepNeuronModelRunner(vllm_config=vllm_config) - else: - return NeuronModelRunner(vllm_config=vllm_config) + raise NotImplementedError( + "Speculative decoding is not supported for TransformersNeuronX" + ) + return NeuronModelRunner(vllm_config=vllm_config) def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.multi_step_neuronx_distributed_model_runner import ( - MultiStepNeuronxDistributedModelRunner) from vllm.worker.neuronx_distributed_model_runner import ( NeuronxDistributedModelRunner) if self.speculative_config is not None: - assert (self.lora_config - is None), "LoRA is not supported for Speculative Decoding" - return MultiStepNeuronxDistributedModelRunner( - vllm_config=vllm_config) - else: - return NeuronxDistributedModelRunner(vllm_config=vllm_config) + assert (self.lora_config is None), ( + "LoRA is not supported for Speculative Decoding") + raise NotImplementedError( + "Speculative decoding is not supported for NeuronxDistributed") + return NeuronxDistributedModelRunner(vllm_config=vllm_config) def init_device(self) -> None: self.init_distributed_environment() From d31f97cf57839b71cc182c6547a87278aa32d8cb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:21:18 -0700 Subject: [PATCH 217/932] [Misc] Remove tests/multi_step/__init__.py (#22778) Signed-off-by: Woosuk Kwon --- tests/multi_step/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/multi_step/__init__.py diff --git a/tests/multi_step/__init__.py b/tests/multi_step/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From c5830381afbef44023ec1c97ae61ff02f22b1f9a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:38:18 -0700 Subject: [PATCH 218/932] [V0 Deprecation] Remove args for multi-step scheduling (#22779) Signed-off-by: Woosuk Kwon --- tests/utils_/test_utils.py | 1 - vllm/config/scheduler.py | 27 +-------------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 8be1e103dc..084d82dee1 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -161,7 +161,6 @@ def parser_with_config(): parser.add_argument('--port', type=int) parser.add_argument('--tensor-parallel-size', type=int) parser.add_argument('--trust-remote-code', action='store_true') - parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean) return parser diff --git 
a/vllm/config/scheduler.py b/vllm/config/scheduler.py index db669600a0..9300201279 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -115,12 +115,6 @@ class SchedulerConfig: (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead.""" - num_scheduler_steps: int = 1 - """Maximum number of forward steps per scheduler call.""" - - multi_step_stream_outputs: bool = True - """If False, then multi-step will stream outputs at the end of all steps""" - send_delta_data: bool = False """Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only @@ -193,16 +187,7 @@ class SchedulerConfig: if self.max_num_batched_tokens is None: if self.enable_chunked_prefill: - if self.num_scheduler_steps > 1: - # Multi-step Chunked-Prefill doesn't allow prompt-chunking - # for now. Have max_num_batched_tokens set to max_model_len - # so we don't reject sequences on account of a short - # max_num_batched_tokens. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - self.max_num_batched_tokens = ( - DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS else: # If max_model_len is too short, use # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value @@ -293,12 +278,6 @@ class SchedulerConfig: f"({self.num_lookahead_slots}) must be greater than or " "equal to 0.") - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - if self.max_num_partial_prefills < 1: raise ValueError( f"max_num_partial_prefills ({self.max_num_partial_prefills}) " @@ -323,7 +302,3 @@ class SchedulerConfig: f"max_num_partial_prefills ({self.max_num_partial_prefills}).") return self - - @property - def is_multi_step(self) -> bool: - return self.num_scheduler_steps > 1 From 4f0f844b1675419fd2171bc5e981a82386ec552b Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:21:50 +0800 Subject: [PATCH 219/932] Fix cuda illegal mem access with Llama4 TP8 + rms_norm custom op (#22701) Signed-off-by: Po-Han Huang --- vllm/model_executor/models/llama4.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 1f8b9d0744..308cb3e85e 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -224,10 +224,14 @@ class Llama4Attention(nn.Module): if self.rotary_emb is not None: q, k = self.rotary_emb(positions, q, k) + if self.qk_norm is not None: - q = q.reshape(-1, self.num_heads, self.head_dim) + # Normalization is applied on the head_dim dimension. The rest of + # the dimensions are collapsed into a single dimension to support + # custom rms_norm cuda kernel. 
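(In shape terms, the fix folds the (tokens, heads) dimensions into one axis so the kernel sees a plain 2-D tensor normalized over head_dim. A small sketch with toy sizes, using an unweighted plain-PyTorch RMSNorm in place of the fused custom op; the numbers are hypothetical, not Llama4's real dimensions.)

    import torch

    T, H, D = 4, 8, 128                  # tokens, heads on this rank, head_dim (toy sizes)
    q = torch.randn(T, H * D)
    q2 = q.reshape(-1, D)                # (T*H, D): the 2-D layout the rms_norm kernel expects
    rms = torch.rsqrt(q2.float().pow(2).mean(dim=-1, keepdim=True) + 1e-6)
    q_out = (q2.float() * rms).reshape(-1, H * D).to(q.dtype)    # back to (T, H*D)
    assert q_out.shape == q.shape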
+ q = q.reshape(-1, self.head_dim) q = self.qk_norm(q.float()).reshape(-1, self.q_size).to(q.dtype) - k = k.reshape(-1, self.num_kv_heads, self.head_dim) + k = k.reshape(-1, self.head_dim) k = self.qk_norm(k.float()).reshape(-1, self.kv_size).to(k.dtype) # We are applying temperature tuning (https://arxiv.org/abs/2501.19399) From b1361c7273f60ca244e5425bdb7a9120057327fe Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:05 -0400 Subject: [PATCH 220/932] [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738) Signed-off-by: mgoin --- vllm/platforms/cuda.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7095913157..63f6b373c3 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,6 +152,9 @@ class CudaPlatformBase(Platform): if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True + # TODO: This does not work, because the + # global_force_attn_backend_context_manager is not set. + # See vllm/attention/selector.py:_cached_get_attn_backend envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" else: # Not Blackwell @@ -217,7 +220,9 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA: + if selected_backend == _Backend.CUTLASS_MLA or ( + cls.is_device_capability(100) and selected_backend is None + and block_size == 128): if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." From c6b928798e96f0a99a666945686c63b61bbbced4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:16 -0400 Subject: [PATCH 221/932] Force TRTLLM attention for gpt-oss on SM100 (#22678) Signed-off-by: mgoin --- vllm/model_executor/models/gpt_oss.py | 5 +---- vllm/utils/flashinfer.py | 8 ++++++++ vllm/v1/attention/backends/flashinfer.py | 11 +++++++---- vllm/v1/attention/backends/utils.py | 5 ++++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 6a65bbbe2e..7c7712dbe1 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -8,7 +8,6 @@ import torch.distributed as dist from torch import nn from transformers import GptOssConfig -from vllm import envs from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -70,11 +69,9 @@ class OAIAttention(nn.Module): tp_size = get_tensor_model_parallel_world_size() - attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION - else torch.bfloat16) self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, - dtype=attention_sink_dtype, + dtype=torch.bfloat16, requires_grad=False)) self.norm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5998d4c312..6b23ed4268 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -154,6 +154,7 @@ def use_trtllm_attention( num_qo_heads: Optional[int], num_kv_heads: Optional[int], attn_head_size: Optional[int], + has_sinks: bool = False, ) -> bool: # Requires SM100 and NVIDIA artifactory to be accessible to download cubins if not (current_platform.is_device_capability(100) @@ -165,6 +166,13 @@ def use_trtllm_attention( or num_qo_heads % 
num_kv_heads != 0): return False + # If sinks are being used, we must use TRTLLM attention as it's + # the only backend that supports them + if has_sinks: + logger.info_once( + "Using TRTLLM attention (required for attention sinks).") + return True + env_value = envs.VLLM_USE_TRTLLM_ATTENTION if env_value is not None: logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index c85d8bce31..12e5542d69 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -523,14 +523,17 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): num_kv_heads = self.kv_cache_spec.num_kv_heads head_dim = self.kv_cache_spec.head_size + # Check if any layer uses sinks (requires TRTLLM attention) + has_sinks = self.global_hyperparameters.has_sinks + # currently prefill trtllm attention does not support fp8 kv cache prefill_use_trtllm = not cache_dtype.startswith("fp8") \ and use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) decode_use_trtllm = use_trtllm_attention( num_decode_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -642,9 +645,9 @@ class FlashInferImpl(AttentionImpl): f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." ) + # Cast sinks to float32 if needed (FlashInfer requirement) if sinks.dtype != torch.float32: - raise ValueError("Sinks must be of type float32, but got " - f"{sinks.dtype}.") + sinks = sinks.to(torch.float32) self.sinks = sinks def forward( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index e23dd8bc5b..91eb84245a 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -285,6 +285,7 @@ class PerLayerParameters: window_left: int logits_soft_cap: Optional[float] sm_scale: float + has_sinks: bool = False def get_per_layer_parameters( @@ -307,9 +308,11 @@ def get_per_layer_parameters( window_left = window_size[0] if window_size is not None else -1 logits_soft_cap = getattr(impl, "logits_soft_cap", None) sm_scale = impl.scale + has_sinks = getattr(impl, "sinks", None) is not None per_layer_params[key] = PerLayerParameters(window_left, - logits_soft_cap, sm_scale) + logits_soft_cap, sm_scale, + has_sinks) return per_layer_params From 4082338a25851e1f923ad5601616f2717536c6fd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:26:38 -0400 Subject: [PATCH 222/932] Remove unneeded ROCm platform import when using CUDA (#22765) Signed-off-by: mgoin --- vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/attention/ops/chunked_prefill_paged_decode.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 1ee1dea729..da3d9ff328 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -22,7 +22,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention if TYPE_CHECKING: from vllm.worker.model_runner import 
ModelInputForGPUWithSamplingMetadata @@ -886,6 +885,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): num_seqs, num_heads, head_size = decode_query.shape block_size = value_cache.shape[3] gqa_ratio = num_heads // self.num_kv_heads + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( decode_query.dtype, head_size, block_size, gqa_ratio, decode_meta.max_decode_seq_len, self.sliding_window, diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index dc10d7eca9..e5b90a8b27 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -11,7 +11,6 @@ import torch from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention from vllm.triton_utils import tl, triton from .prefix_prefill import context_attention_fwd @@ -296,6 +295,7 @@ def chunked_prefill_paged_decode( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( query.dtype, head_size, From 77a6bf07aedf132aad2b6719f6d87abc5d3311ab Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:31:47 -0400 Subject: [PATCH 223/932] [Bug] Fix Unexpected Keyword Argument 'w1_bias' (#22757) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/layer.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fb38fb91ea..8ef0a805d8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -475,12 +475,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) else: - return self.fused_experts( + # add w1_bias/w2_bias to kwargs if they exist + kwargs = dict( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_bias=layer.w13_bias if self.has_bias else None, - w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -489,6 +488,17 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): global_num_experts=global_num_experts, expert_map=expert_map, ) + if isinstance(self.fused_experts, + FusedMoEModularKernel) and self.has_bias: + raise ValueError( + "FusedMoEModularKernel does not support bias.") + if self.has_bias: + kwargs.update({ + "w1_bias": getattr(layer, "w13_bias", None), + "w2_bias": getattr(layer, "w2_bias", None), + }) + + return self.fused_experts(**kwargs) def forward_cpu( self, From 4c558cf62ed69fbd8c031809b0a7f8b12afa980b Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Tue, 12 Aug 2025 21:34:47 -0700 Subject: [PATCH 224/932] [Perf] Support topk softmax fused kernel for broader num_experts (#22211) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui --- csrc/moe/topk_softmax_kernels.cu | 77 +++++++++++++++++++------------- tests/kernels/moe/test_moe.py | 2 +- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 7a7865b901..946c137db6 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ 
b/csrc/moe/topk_softmax_kernels.cu @@ -188,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK( It fuses the softmax, max and argmax into a single kernel. Limitations: - 1) This implementation is intended for when the number of experts is a small power of 2. + 1) This implementation is optimized for when the number of experts is a small power of 2. + Additionally it also supports when number of experts is multiple of 64 which is still + faster than the computing softmax and topK separately (only tested on CUDA yet). 2) This implementation assumes k is small, but will work for any k. */ @@ -198,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ int* source_rows, const int k, const int start_expert, const int end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. - static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); - static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); @@ -407,12 +407,10 @@ struct TopkConstants }; } // namespace detail -template +template void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) { - static constexpr std::size_t MAX_BYTES_PER_LDG = 16; - static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); using Constants = detail::TopkConstants; static constexpr int VPT = Constants::VPT; @@ -425,21 +423,12 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); } -#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ - switch (warpSize) { \ - case 32: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - case 64: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \ - } +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \ + "Unsupported warp size. 
Only 32 and 64 are supported."); \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, stream); template void topkGatingSoftmaxKernelLauncher( @@ -453,38 +442,62 @@ void topkGatingSoftmaxKernelLauncher( const int topk, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; - auto warpSize = WARP_SIZE; + static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; + static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8; switch (num_experts) { case 1: - LAUNCH_SOFTMAX(1, WARPS_PER_TB); + LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 2: - LAUNCH_SOFTMAX(2, WARPS_PER_TB); + LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 4: - LAUNCH_SOFTMAX(4, WARPS_PER_TB); + LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 8: - LAUNCH_SOFTMAX(8, WARPS_PER_TB); + LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 16: - LAUNCH_SOFTMAX(16, WARPS_PER_TB); + LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 32: - LAUNCH_SOFTMAX(32, WARPS_PER_TB); + LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 64: - LAUNCH_SOFTMAX(64, WARPS_PER_TB); + LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 128: - LAUNCH_SOFTMAX(128, WARPS_PER_TB); + LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 256: - LAUNCH_SOFTMAX(256, WARPS_PER_TB); + LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; + case 512: + LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + // (CUDA only) support multiples of 64 when num_experts is not power of 2. + // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, + // alternatively we can test 4 bytes loading and enable it in future. 
+#ifndef USE_ROCM + case 192: + LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 320: + LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 384: + LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 448: + LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 576: + LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; +#endif default: { TORCH_CHECK(softmax_workspace != nullptr, - "softmax_workspace must be provided for num_experts that are not a power of 2."); + "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); static constexpr int TPB = 256; moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 0f1c787046..49c097718e 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -36,7 +36,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -NUM_EXPERTS = [8, 64] +NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] TOP_KS = [2, 6] From 6807af8f46acd184f99342ff38f2a1359f693b10 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 12 Aug 2025 21:37:26 -0700 Subject: [PATCH 225/932] [gpt-oss] upgrade gpt-oss to v0.0.3 and add version check (#22768) Signed-off-by: Chen Zhang --- vllm/entrypoints/tool.py | 51 ++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 723cff91d4..758789a5e0 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Optional - -from openai_harmony import Message +from typing import TYPE_CHECKING, Any from vllm.logger import init_logger @@ -15,6 +13,30 @@ if TYPE_CHECKING: logger = init_logger(__name__) +def validate_gpt_oss_install(): + """ + Check if the gpt-oss is installed and its version is at least 0.0.3. + If not, raise an ImportError. + """ + from importlib.metadata import PackageNotFoundError, version + + from packaging.version import InvalidVersion, Version + + try: + pkg_version_str = version("gpt_oss") # e.g., "0.0.5" + pkg_version = Version(pkg_version_str) + except PackageNotFoundError: + raise ImportError("Package 'gpt_oss' is not installed.") from None + except InvalidVersion as e: + raise ImportError( + f"Invalid version string for 'gpt_oss': {e}") from None + + if pkg_version < Version("0.0.3"): + raise ImportError( + f"gpt_oss >= 0.0.3 is required, but {pkg_version} is installed." 
+ ) from None + + class Tool(ABC): @abstractmethod @@ -33,12 +55,14 @@ class HarmonyBrowserTool(Tool): return try: + validate_gpt_oss_install() from gpt_oss.tools.simple_browser import SimpleBrowserTool from gpt_oss.tools.simple_browser.backend import ExaBackend - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, browsing is disabled") + "gpt_oss is not installed properly (%s), browsing is disabled", + e) return browser_backend = ExaBackend(source="web", api_key=exa_api_key) @@ -65,23 +89,16 @@ class HarmonyPythonTool(Tool): self.enabled = True try: + validate_gpt_oss_install() from gpt_oss.tools.python_docker.docker_tool import PythonTool - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, code interpreter is disabled") + "gpt_oss is not installed properly (%s), code interpreter is " + "disabled", e) return - # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response - # and we do the following monkey patch to fix it. - class PatchedGptOssPythonTool(PythonTool): - - def _make_response(self, - output: str, - channel: Optional[str] = None) -> Message: - return super()._make_response(output) - - self.python_tool = PatchedGptOssPythonTool() + self.python_tool = PythonTool() logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: From d16aa3dae446d93f870a2e51b240e18a01cac294 Mon Sep 17 00:00:00 2001 From: zzh142857 Date: Wed, 13 Aug 2025 03:09:13 -0400 Subject: [PATCH 226/932] [Model] Add option to run Step3VisionEncoder in DP (#22697) Signed-off-by: zzh142857 --- vllm/model_executor/models/step3_vl.py | 132 +++++++++++++++++-------- 1 file changed, 91 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 41dba312cb..f1f38c01b7 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -21,6 +21,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -33,6 +34,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Step3VisionEncoderConfig from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -650,7 +652,8 @@ class Step3VisionAttention(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -659,20 +662,42 @@ class Step3VisionAttention(nn.Module): self.scale = self.head_dim**-0.5 - tp_size = get_tensor_model_parallel_world_size() + tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size - self.qkv_proj = 
QKVParallelLinear(self.embed_dim, - self.head_dim, - self.total_num_heads, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.out_proj = RowParallelLinear(self.embed_dim, - self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=prefix) + + self.q_size = self.num_heads * self.head_dim + + if use_data_parallel: + self.qkv_proj = ReplicatedLinear( + self.embed_dim, + 3 * self.q_size, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = ReplicatedLinear( + self.total_num_heads * self.head_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + else: + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.total_num_heads, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = RowParallelLinear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -712,20 +737,25 @@ class Step3VisionMLP(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.fc2 = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=prefix) + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=prefix) + cls_fc2 = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.fc2 = cls_fc2(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=prefix) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -739,15 +769,22 @@ class Step3VisionEncoderLayer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() + self.use_data_parallel = use_data_parallel self.embed_dim = config.hidden_size - self.self_attn = Step3VisionAttention(config, - quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = Step3VisionAttention( + config, + quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=self.use_data_parallel) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Step3VisionMLP(config, quant_config, prefix=f"{prefix}.mlp") + self.mlp = Step3VisionMLP(config, + quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=self.use_data_parallel) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -767,13 +804,16 @@ class Step3VisionEncoder(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.layers = nn.ModuleList([ Step3VisionEncoderLayer(config, quant_config, - prefix=f"{prefix}.layers.{i}") + prefix=f"{prefix}.layers.{i}", + 
use_data_parallel=self.use_data_parallel) for i in range(config.num_hidden_layers) ]) @@ -792,21 +832,29 @@ class Step3VisionTransformer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.image_size = config.image_size self.embeddings = Step3VisionEmbeddings(config) - self.transformer = Step3VisionEncoder(config, - quant_config, - prefix=f"{prefix}.transformer") + self.transformer = Step3VisionEncoder( + config, + quant_config, + prefix=f"{prefix}.transformer", + use_data_parallel=self.use_data_parallel) def forward( self, pixel_values: torch.Tensor, ): hidden_states = self.embeddings(pixel_values) - hidden_states = self.transformer(inputs_embeds=hidden_states) + if self.use_data_parallel: + hidden_states = run_dp_sharded_vision_model( + hidden_states, self.transformer) + else: + hidden_states = self.transformer(inputs_embeds=hidden_states) return hidden_states @@ -836,13 +884,15 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = (vllm_config.parallel_config. + enable_multimodal_encoder_data_parallel) if multimodal_config.get_limit_per_prompt("image"): - self.vision_model = Step3VisionTransformer(config.vision_config, - None, - prefix=maybe_prefix( - prefix, - "vision_model")) + self.vision_model = Step3VisionTransformer( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel) self.vit_downsampler = nn.Conv2d( config.vision_config.hidden_size, config.vision_config.output_hidden_size, From 9e7e5baaa83b1e5070a3cf3823c134b28eaa2a1c Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Wed, 13 Aug 2025 16:23:33 +0800 Subject: [PATCH 227/932] [Model] Add missing prefix to glm4_1v (#22716) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2a89c03bfe..88c53c8363 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -453,25 +453,30 @@ class Glm4vPatchMerger(nn.Module): context_dim: int, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = d_model self.proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=bias, - gather_output=True) + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.proj") self.post_projection_norm = nn.LayerNorm(self.hidden_size) self.gate_up_proj = MergedColumnParallelLinear( input_size=self.hidden_size, output_sizes=[context_dim] * 2, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.down_proj = RowParallelLinear( context_dim, self.hidden_size, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) self.act_fn = SiluAndMul() self.extra_activation_func = nn.GELU() @@ -661,6 +666,7 @@ class Glm4vVisionTransformer(nn.Module): context_dim=vision_config.intermediate_size, quant_config=quant_config, bias=False, + prefix=f"{prefix}.merger", ) self.embeddings = Glm4vVisionEmbeddings(vision_config) From a01e0018b50fbda6aaf151268fd6f4769b6e81a8 Mon Sep 17 00:00:00 
2001 From: Duc-Viet Hoang Date: Wed, 13 Aug 2025 17:11:36 +0700 Subject: [PATCH 228/932] [Bugfix] Fix Nemotron VL image processing (#22739) Co-authored-by: ducviet00-h2 --- .../multimodal/processing/test_nemotron_vl.py | 8 +- vllm/model_executor/models/nemotron_vl.py | 186 ++++++++++++++++++ 2 files changed, 190 insertions(+), 4 deletions(-) diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 3ce88bc427..6fbbab0d26 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -23,15 +23,15 @@ def _get_expected_num_patches( min_num: int, max_num: int, ): - from vllm.model_executor.models.internvl import ( - calculate_internvl_targets, get_internvl_target_ratios) + from vllm.model_executor.models.nemotron_vl import ( + calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios) width, height = image.size - blocks, _, _ = calculate_internvl_targets( + blocks, _, _ = calculate_nemotron_vl_targets( orig_width=width, orig_height=height, - target_ratios=get_internvl_target_ratios( + target_ratios=get_nemotron_vl_target_ratios( min_num, max_num, ), diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index b90cb9b39a..82bcd06462 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -13,6 +13,7 @@ from typing import Optional import torch import torch.nn as nn +import torchvision.transforms as T from PIL import Image from transformers import AutoModel, PretrainedConfig from transformers.image_processing_utils_fast import BaseImageProcessorFast @@ -27,6 +28,7 @@ from vllm.model_executor.models.internvl import ( from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.processing import PromptUpdateDetails from vllm.sequence import IntermediateTensors @@ -44,6 +46,146 @@ IMG_END = '' IMG_CONTEXT = '' +def build_transform(input_size: int): + return T.Compose([ + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + ]) + + +# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1 +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: + best_factor = float('-inf') + best_ratio = (1, 1) + area = width * height + + for rw, rh in target_ratios: + target_aspect_ratio = rw / rh + size_factor = min((rw * rh * image_size * image_size) / area, 0.6) + ratio_closeness = min(target_aspect_ratio / aspect_ratio, + aspect_ratio / target_aspect_ratio) + factor = size_factor * ratio_closeness + + if factor > best_factor: + best_factor = factor + best_ratio = (rw, rh) + + return best_ratio + + +def calculate_nemotron_vl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + 
image_size=image_size, + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: + blocks += 1 + + return blocks, target_width, target_height + + +def dynamic_preprocess_nemotron_vl( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: + orig_width, orig_height = image.size + + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_nemotron_vl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + + assert len(processed_images) == blocks + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + + return processed_images + + +def get_nemotron_vl_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def image_to_pixel_values_nemotron_vl( + image: Image.Image, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_nemotron_vl_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + + images = dynamic_preprocess_nemotron_vl( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) + return pixel_values + + class NemotronVLProcessor(InternVLProcessor): def __init__( @@ -87,6 +229,50 @@ class NemotronVLProcessor(InternVLProcessor): def image_token_id(self) -> int: return self.tokenizer.get_vocab()[IMG_CONTEXT] + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + + num_patches, _, _ = calculate_nemotron_vl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) + + return num_patches * self.num_image_token + + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_nemotron_vl( + image, + 
input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + def _preprocess_image( self, text: list[str], From 3f52738dce57360ccc92c9993c5adcaaec1f5ac2 Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 13 Aug 2025 19:10:07 +0800 Subject: [PATCH 229/932] [Doc] Add max_lora_rank configuration guide (#22782) Signed-off-by: chiliu --- docs/features/lora.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/features/lora.md b/docs/features/lora.md index a4e05dae11..668460a368 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -351,3 +351,22 @@ vllm serve ibm-granite/granite-speech-3.3-2b \ ``` Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions. + +## Using Tips + +### Configuring `max_lora_rank` + +The `--max-lora-rank` parameter controls the maximum rank allowed for LoRA adapters. This setting affects memory allocation and performance: + +- **Set it to the maximum rank** among all LoRA adapters you plan to use +- **Avoid setting it too high** - using a value much larger than needed wastes memory and can cause performance issues + +For example, if your LoRA adapters have ranks [16, 32, 64], use `--max-lora-rank 64` rather than 256 + +```bash +# Good: matches actual maximum rank +vllm serve model --enable-lora --max-lora-rank 64 + +# Bad: unnecessarily high, wastes memory +vllm serve model --enable-lora --max-lora-rank 256 +``` From d94e3026ded838bc0c3eec9e0a0b4b3affa0cbc9 Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Wed, 13 Aug 2025 04:11:28 -0700 Subject: [PATCH 230/932] [V1] Add tree drafting tests for eagle spec decoding (#22705) Signed-off-by: Giancarlo Delfin --- tests/v1/spec_decode/test_eagle.py | 160 +++++++++++++++++++++++- tests/v1/spec_decode/test_max_len.py | 6 - vllm/v1/attention/backends/tree_attn.py | 6 +- vllm/v1/spec_decode/eagle.py | 61 +++------ 4 files changed, 178 insertions(+), 55 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 2b4f8bd2a8..7b8445a0b2 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional from unittest import mock import pytest @@ -23,7 +24,11 @@ eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" -def _create_proposer(method: str, k: int) -> EagleProposer: +def _create_proposer( + method: str, + num_speculative_tokens: int, + speculative_token_tree: Optional[list[tuple[int]]] = None, +) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100) @@ -31,12 +36,18 @@ def _create_proposer(method: str, k: int) -> EagleProposer: # Choose model directory based on method draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir + spec_token_tree_str = None + if speculative_token_tree is not None: + assert num_speculative_tokens == len(speculative_token_tree) + spec_token_tree_str = str(speculative_token_tree) + speculative_config = SpeculativeConfig( target_model_config=model_config, target_parallel_config=ParallelConfig(), model=draft_model_dir, method=method, - num_speculative_tokens=k, + num_speculative_tokens=num_speculative_tokens, + speculative_token_tree=spec_token_tree_str, ) vllm_config = VllmConfig( @@ -189,7 
+200,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, target_model.lm_head = mock.MagicMock() # Create proposer using the helper function - proposer = _create_proposer(method, k=8) + proposer = _create_proposer(method, num_speculative_tokens=8) # Call the method under test proposer.load_model(target_model) @@ -226,6 +237,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): pytest.skip("TRITON_ATTN_VLLM_V1 does not support " "multi-token eagle spec decode on current platform") + if (attn_backend == "TREE_ATTN"): + pytest.skip("TREE_ATTN is tested separately in test_propose_tree" + "because it requires special input mocking.") + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") @@ -378,3 +393,142 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): # Verify all tokens match our expectations assert torch.equal(result, expected_tokens) + + +@pytest.mark.parametrize( + "spec_token_tree", + [ + [(0, )], # A single token + [(0, ), (0, 0), (0, 0, 0)], # Chain + [(0, ), (1, ), (2, )], # Parallel + [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0), + (2, 1)], # Tree + ]) +def test_propose_tree(spec_token_tree): + # Get GPU device. + device = torch.device(current_platform.device_type) + + # Setup test parameters. + batch_size = 2 + seq_len_1 = 5 + seq_len_2 = 3 + total_tokens = seq_len_1 + seq_len_2 + vocab_size = 100 + seq_lens = [seq_len_1, seq_len_2] + num_speculative_tokens = len(spec_token_tree) + + # Create proposer first so we can use its actual hidden_size. + proposer = _create_proposer("eagle", + num_speculative_tokens, + speculative_token_tree=spec_token_tree) + # Get the hidden_size from the proposer to ensure consistency. + hidden_size = proposer.hidden_size + + # Helper to create deterministic logits that will produce specific tokens + def create_deterministic_logits(token_ids, k: int): + logits = torch.full((batch_size, vocab_size), -100.0, device=device) + for i, token_id in enumerate(token_ids): + # Assign decreasing values to the k, consecutive, tokens. + for j in range(k): + logits[i, token_id + j] = 100.0 - j + return logits + + # Mock a model that returns deterministic logits. + base_token_ids = torch.tensor([42, 60], dtype=torch.int64, device=device) + + # Skip loading the model and replace it with a mock that returns + # deterministic outputs. + model_mock = mock.MagicMock() + + # Mock the model forward calls. + forward_returns = [(torch.zeros(total_tokens, hidden_size, device=device), + torch.zeros(total_tokens, hidden_size, device=device))] + for cu_num_drafts in proposer.cu_drafts_per_level: + h_logits = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + h_states = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + forward_returns.append((h_logits, h_states)) + model_mock.side_effect = forward_returns + + # Mock the compute_logits calls. 
+ cu_num_drafts_tensor = torch.tensor([0] + proposer.cu_drafts_per_level, + dtype=torch.int32, + device=device) + logits_returns = [] + for level, num_children in enumerate(proposer.child_drafts_per_level): + token_ids = base_token_ids + cu_num_drafts_tensor[level] + level_num_drafts = cu_num_drafts_tensor[ + level + 1] - cu_num_drafts_tensor[level] + level_logits = [] + for i in range(level_num_drafts // num_children): + level_logits.append( + create_deterministic_logits(token_ids + i * num_children, + num_children)) + logits_returns.append(torch.stack(level_logits, dim=1)) + model_mock.compute_logits.side_effect = logits_returns + + # Assign the mock to the proposer + proposer.model = model_mock + + # Assign draft attn_layer_names since load_model is not invoked + proposer.attn_layer_names = ["layer.0"] + + # Get the tree attention metadata builder. + attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN) + attn_metadata_builder = attn_metadata_builder_cls( + kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), + layer_names=proposer.attn_layer_names, + vllm_config=proposer.vllm_config, + device=device, + ) + + # Mock runner for attention metadata building. + proposer.runner = mock.MagicMock() + proposer.runner.attn_groups.append([mock.MagicMock()]) + proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder + + # Setup inputs for the proposer. + target_token_ids = torch.randint(0, + vocab_size, (total_tokens, ), + device=device) + target_positions = torch.cat([ + torch.arange(seq_len_1, device=device), + torch.arange(seq_len_2, device=device) + ]) + target_hidden_states = torch.randn(total_tokens, + hidden_size, + device=device) + next_token_ids = torch.randint(0, + vocab_size, (batch_size, ), + dtype=torch.int32, + device=device) + batch_spec = BatchSpec( + seq_lens=seq_lens, + query_lens=seq_lens, + ) + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + sampling_metadata = mock.MagicMock() + + # Propose draft tokens. + result = proposer.propose(target_token_ids=target_token_ids, + target_positions=target_positions, + target_hidden_states=target_hidden_states, + next_token_ids=next_token_ids, + common_attn_metadata=common_attn_metadata, + sampling_metadata=sampling_metadata) + assert result.shape == (batch_size, num_speculative_tokens) + + # The tokens are expected to be consecutive integers starting + # from the base token IDs. + expected_tokens = base_token_ids[:, None] + torch.arange( + num_speculative_tokens, dtype=torch.int64, device=device) + + # Verify that the draft tokens match our expectations. 
+ assert torch.equal(result, expected_tokens) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 01019b29e0..a5b10bb518 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -39,12 +39,6 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - - if attn_backend == "TREE_ATTN" and num_speculative_tokens > 1: - # TREE_ATTN fails the test with multi-token spec decode - # TODO: Investigate why - pytest.skip("TREE_ATTN fails the test") - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 3b53b039f1..5d10e9e260 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -236,9 +236,9 @@ class TreeAttentionMetadataBuilder( # Use prefill for drafting at the root level. self.tree_attn_bias = torch.empty(0) else: - # Slice the tree attention bias for drafting. - query_len = common_attn_metadata.max_query_len - start, end = draft_index, draft_index + query_len + # Slice the tree attention bias for drafting. Exclude + # the root level. + start, end = 1, 1 + common_attn_metadata.max_query_len self.tree_attn_bias = self.tree_attn_bias[start:end, start:end].contiguous() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f75d76dd97..a8a160a0f9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -113,13 +113,6 @@ class EagleProposer: num_drafts_per_level[level]) self.child_drafts_per_level.append(num_drafts_per_level[level] // num_drafts_per_level[level - 1]) - # Find the first level where the tree branches off into one or more - # children. - self.first_branching_level = None - for level in range(tree_depth): - if self.cu_drafts_per_level[level] > level + 1: - self.first_branching_level = level - break # Precompute draft position offsets in flattened tree. self.tree_draft_pos_offsets = torch.arange( 1, @@ -209,11 +202,10 @@ class EagleProposer: logits = self.model.compute_logits(sample_hidden_states, None) positions = target_positions[last_token_indices] hidden_states = hidden_states[last_token_indices] - if self.first_branching_level == 0: - # Branching has occurred at the root level. Draft using tree - # attention. + + if isinstance(attn_metadata, TreeAttentionMetadata): + # Draft using tree attention. draft_token_ids_list = self.propose_tree( - tree_root_level=0, batch_size=batch_size, logits=logits, positions=positions, @@ -242,11 +234,10 @@ class EagleProposer: (TritonAttentionMetadata, AiterFlashAttentionMetadata, FlashAttentionMetadata)) else: - # Currently, only FlashAttention and TreeAttention support - # multi-token eagle spec decode. This is because the code below - # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, - (FlashAttentionMetadata, TreeAttentionMetadata)) + # Currently, only FlashAttention supports multi-token eagle spec + # decode. This is because the code below makes assumptions about + # attn_metadata attributes available. + assert isinstance(attn_metadata, FlashAttentionMetadata) # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] @@ -259,7 +250,7 @@ class EagleProposer: attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for token_index in range(self.num_speculative_tokens - 1): + for _ in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -327,21 +318,6 @@ class EagleProposer: hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], None) - - if self.first_branching_level == token_index + 1: - # Branching has occurred. The remaining tokens are drafted - # using tree attention. - draft_token_ids_list += self.propose_tree( - tree_root_level=token_index + 1, - batch_size=batch_size, - logits=logits, - positions=positions, - hidden_states=hidden_states, - common_attn_metadata=common_attn_metadata, - ) - # [batch_size, num_tree_tokens] - return torch.cat(draft_token_ids_list, dim=1) - draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -351,7 +327,6 @@ class EagleProposer: def propose_tree( self, - tree_root_level: int, batch_size: int, # [num_tokens, vocab_size] logits: torch.Tensor, @@ -366,10 +341,10 @@ class EagleProposer: assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) - total_num_drafts = self.cu_drafts_per_level[tree_root_level] + total_num_drafts = self.cu_drafts_per_level[0] level_num_drafts = total_num_drafts # Sample a draft token for each child at the tree root level. - num_children = self.child_drafts_per_level[tree_root_level] + num_children = self.child_drafts_per_level[0] if num_children == 1: draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1) else: @@ -393,22 +368,23 @@ class EagleProposer: positions.view(batch_size, -1) + self.tree_draft_pos_offsets[:batch_size, :]) tree_depth = len(self.cu_drafts_per_level) - for level in range(tree_root_level, tree_depth - 1): + for level in range(tree_depth - 1): # Get draft positions for RoPE. draft_positions = positions + (level + 1) exceeds_max_model_len = (positions + total_num_drafts) >= self.max_model_len # Mask out the position ids that exceed the max model length. # Otherwise, we may get out-of-range error in RoPE. - clamped_draft_positions = torch.where( + draft_positions = torch.where( exceeds_max_model_len, 0, draft_positions, - ) + ).view(batch_size, -1) + if level_num_drafts > 1: # Repeat the positions for each draft at this level. - draft_positions = clamped_draft_positions.repeat_interleave( - level_num_drafts).reshape(batch_size, -1) + draft_positions = draft_positions.repeat_interleave( + level_num_drafts, dim=1) if num_children > 1: # Repeat draft hidden states for each child. @@ -425,7 +401,7 @@ class EagleProposer: # Build new attention metadata for the next level of drafts. # This is necessary to support tree attention. - query_len = total_num_drafts - tree_root_level + query_len = total_num_drafts common_attn_metadata = replace( common_attn_metadata, query_start_loc=query_len * self.arange[:batch_size + 1], @@ -435,7 +411,7 @@ class EagleProposer: ) attn_metadata = tree_attn_metadata_builder.build_for_drafting( common_attn_metadata=common_attn_metadata, - draft_index=tree_root_level + 1, + draft_index=level + 1, ) # Apply new attention metadata to all layers. 
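The drafting loop above depends on two per-level tables that the proposer precomputes from the speculative token tree: `cu_drafts_per_level` (the cumulative number of drafts up to each tree level) and `child_drafts_per_level` (how many children each draft expands into at the next level). A minimal sketch of that bookkeeping, assuming the tree is encoded as a list of child-index tuples as in `test_propose_tree` above; this is an illustrative sketch, not the vLLM implementation:

```python
from itertools import accumulate


def tree_level_stats(
        spec_token_tree: list[tuple[int, ...]]) -> tuple[list[int], list[int]]:
    """Derive per-level draft counts from a speculative token tree."""
    depth = max(len(node) for node in spec_token_tree)
    drafts_per_level = [0] * depth
    for node in spec_token_tree:
        drafts_per_level[len(node) - 1] += 1
    # Cumulative drafts per level, e.g. [3, 9] for the 9-node tree in the test.
    cu_drafts_per_level = list(accumulate(drafts_per_level))
    # Children spawned by each draft at the next level, e.g. [3, 2].
    child_drafts_per_level = [drafts_per_level[0]]
    for level in range(1, depth):
        child_drafts_per_level.append(drafts_per_level[level] //
                                      drafts_per_level[level - 1])
    return cu_drafts_per_level, child_drafts_per_level
```

For a plain chain such as `[(0, ), (0, 0), (0, 0, 0)]` this reduces to `[1, 2, 3]` and `[1, 1, 1]`, which matches the one-token-per-step decode path that the non-tree branch above still handles.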
@@ -516,7 +492,6 @@ class EagleProposer: level_num_drafts = self.cu_drafts_per_level[level + 1] - total_num_drafts total_num_drafts = self.cu_drafts_per_level[level + 1] - return draft_token_ids_list def prepare_inputs( From 0b1bdac6af33b890a4d68321df05e71a1ba43dc4 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 13 Aug 2025 19:12:00 +0800 Subject: [PATCH 231/932] [Platform] Custom ops support for FusedMoe (#22509) Signed-off-by: wangxiyuan --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- vllm/model_executor/layers/linear.py | 12 ++++++------ .../layers/vocab_parallel_embedding.py | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8ef0a805d8..ddc02168e5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -682,7 +682,8 @@ def determine_expert_map( return (local_num_experts, expert_map) -class FusedMoE(torch.nn.Module): +@CustomOp.register("fused_moe") +class FusedMoE(CustomOp): """FusedMoE layer for MoE models. This layer contains both MergedColumnParallel weights (gate_up_proj / diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index bb81a663d4..75391c51f7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -16,6 +16,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -226,7 +227,7 @@ class UnquantizedLinearMethod(LinearMethodBase): return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) -class LinearBase(torch.nn.Module): +class LinearBase(CustomOp): """Base linear layer. Args: @@ -269,12 +270,8 @@ class LinearBase(torch.nn.Module): prefix=prefix) self.return_bias = return_bias - def forward( - self, x: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: - raise NotImplementedError - +@CustomOp.register("replicated_linear") class ReplicatedLinear(LinearBase): """Replicated linear layer. @@ -443,6 +440,7 @@ class MergedReplicatedLinear(ReplicatedLinear): param[shard_offset:shard_offset + shard_size] = loaded_weight +@CustomOp.register("column_parallel_linear") class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. @@ -1229,6 +1227,7 @@ class QKVParallelLinear(ColumnParallelLinear): param_data.copy_(loaded_weight) +@CustomOp.register("row_parallel_linear") class RowParallelLinear(LinearBase): """Linear layer with row parallelism. @@ -1405,6 +1404,7 @@ class RowParallelLinear(LinearBase): return s +@CustomOp.register("qkv_cross_parallel_linear") class QKVCrossParallelLinear(LinearBase): """Linear layers for efficient cross-attention's QKV transformation. 
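The changes above register `FusedMoE` and the linear layers (and, just below, `VocabParallelEmbedding`) as `CustomOp` subclasses keyed by string names, which is the hook that lets a platform plugin swap in its own implementation of these layers. A rough sketch of the general registry pattern this enables, using hypothetical names rather than the real `CustomOp` API:

```python
# Hypothetical sketch of a name-keyed op registry; the actual
# vllm.model_executor.custom_op.CustomOp API may differ.
import torch
import torch.nn as nn


class SimpleCustomOp(nn.Module):
    registry: dict[str, type["SimpleCustomOp"]] = {}

    @classmethod
    def register(cls, name: str):
        def decorator(op_cls):
            cls.registry[name] = op_cls
            return op_cls

        return decorator

    def forward(self, *args, **kwargs):
        # A platform plugin could dispatch on the registered name here and
        # call a device-specific kernel instead of the native path.
        return self.forward_native(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        raise NotImplementedError


@SimpleCustomOp.register("replicated_linear")
class ToyReplicatedLinear(SimpleCustomOp):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(out_features, in_features))

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.linear(x, self.weight)
```

Because each layer is looked up by name (for example `"fused_moe"` or `"row_parallel_linear"`), an out-of-tree backend can override specific layers without importing or patching model code.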
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index a5f262c832..9f223998e5 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,6 +12,7 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -159,7 +160,8 @@ def get_masked_input_and_mask( return input_, ~vocab_mask -class VocabParallelEmbedding(torch.nn.Module): +@CustomOp.register("vocab_parallel_embedding") +class VocabParallelEmbedding(CustomOp): """Embedding parallelized in the vocabulary dimension. Adapted from torch.nn.Embedding, note that we pad the vocabulary size to From 653124bd46c57770b151eb58cc2a59170753daa5 Mon Sep 17 00:00:00 2001 From: Kdump Date: Wed, 13 Aug 2025 19:14:24 +0800 Subject: [PATCH 232/932] [Frontend] Add chunked processing to handle long inputs in embedding models (#22280) Signed-off-by: x22x22 Signed-off-by: Kdump Signed-off-by: DarkLight1337 Co-authored-by: Cyrus Leung Co-authored-by: Maximilien de Bayser Co-authored-by: DarkLight1337 --- .../openai_embedding_long_text/README.md | 186 +++++++ .../openai_embedding_long_text/client.py | 366 ++++++++++++++ .../openai_embedding_long_text/service.sh | 137 ++++++ .../openai/test_embedding_long_text.py | 441 +++++++++++++++++ vllm/config/__init__.py | 19 + vllm/entrypoints/openai/serving_embedding.py | 457 +++++++++++++++++- 6 files changed, 1603 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/openai_embedding_long_text/README.md create mode 100644 examples/online_serving/openai_embedding_long_text/client.py create mode 100644 examples/online_serving/openai_embedding_long_text/service.sh create mode 100644 tests/entrypoints/openai/test_embedding_long_text.py diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/online_serving/openai_embedding_long_text/README.md new file mode 100644 index 0000000000..04edc4680e --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/README.md @@ -0,0 +1,186 @@ +# Long Text Embedding with Chunked Processing + +This directory contains examples for using vLLM's **chunked processing** feature to handle long text embedding that exceeds the model's maximum context length. 
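The idea behind chunked processing is straightforward: an input that exceeds the model's native window is split into window-sized chunks, each chunk is embedded with the model's own pooling, and the per-chunk vectors are combined with a token-count-weighted mean. A minimal illustrative sketch of that aggregation (plain PyTorch, not the vLLM internals; `embed_chunk` stands in for whatever produces a single chunk embedding):

```python
import torch


def embed_long_input(token_ids: list[int], max_chunk_size: int,
                     embed_chunk) -> torch.Tensor:
    # Split into chunks that fit the model's native window.
    chunks = [
        token_ids[i:i + max_chunk_size]
        for i in range(0, len(token_ids), max_chunk_size)
    ]
    # Embed each chunk independently, then weight by chunk length.
    chunk_embs = torch.stack([embed_chunk(chunk) for chunk in chunks])
    weights = torch.tensor([len(chunk) for chunk in chunks],
                           dtype=chunk_embs.dtype)
    pooled = (chunk_embs * weights[:, None]).sum(dim=0) / weights.sum()
    # Final L2 normalization mirrors the `"normalize": true` pooler setting.
    return torch.nn.functional.normalize(pooled, dim=0)
```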
+ +## 🚀 Quick Start + +### Start the Server + +Use the provided script to start a vLLM server with chunked processing enabled: + +```bash +# Basic usage (supports very long texts up to ~3M tokens) +./service.sh + +# Custom configuration with different models +MODEL_NAME="jinaai/jina-embeddings-v3" \ +MAX_EMBED_LEN=1048576 \ +./service.sh + +# For extremely long documents +MODEL_NAME="intfloat/multilingual-e5-large" \ +MAX_EMBED_LEN=3072000 \ +./service.sh +``` + +### Test Long Text Embedding + +Run the comprehensive test client: + +```bash +python client.py +``` + +## 📁 Files + +| File | Description | +|------|-------------| +| `service.sh` | Server startup script with chunked processing enabled | +| `client.py` | Comprehensive test client for long text embedding | + +## ⚙️ Configuration + +### Server Configuration + +The key parameters for chunked processing are in the `--override-pooler-config`: + +```json +{ + "pooling_type": "auto", + "normalize": true, + "enable_chunked_processing": true, + "max_embed_len": 3072000 +} +``` + +!!! note + `pooling_type` sets the model's own pooling strategy for processing within each chunk. The cross-chunk aggregation automatically uses MEAN strategy when input exceeds the model's native maximum length. + +#### Chunked Processing Behavior + +Chunked processing uses **MEAN aggregation** for cross-chunk combination when input exceeds the model's native maximum length: + +| Component | Behavior | Description | +|-----------|----------|-------------| +| **Within chunks** | Model's native pooling | Uses the model's configured pooling strategy | +| **Cross-chunk aggregation** | Always MEAN | Weighted averaging based on chunk token counts | +| **Performance** | Optimal | All chunks processed for complete semantic coverage | + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `MODEL_NAME` | `intfloat/multilingual-e5-large` | Embedding model to use (supports multiple models) | +| `PORT` | `31090` | Server port | +| `GPU_COUNT` | `1` | Number of GPUs to use | +| `MAX_EMBED_LEN` | `3072000` | Maximum embedding input length (supports very long documents) | +| `POOLING_TYPE` | `auto` | Model's native pooling type: `auto`, `MEAN`, `CLS`, `LAST` (only affects within-chunk pooling, not cross-chunk aggregation) | +| `API_KEY` | `EMPTY` | API key for authentication | + +## 🔧 How It Works + +1. **Enhanced Input Validation**: `max_embed_len` allows accepting inputs longer than `max_model_len` without environment variables +2. **Smart Chunking**: Text is split based on `max_position_embeddings` to maintain semantic integrity +3. **Unified Processing**: All chunks processed separately through the model using its configured pooling strategy +4. **MEAN Aggregation**: When input exceeds model's native length, results combined using token count-based weighted averaging across all chunks +5. 
**Consistent Output**: Final embeddings maintain the same dimensionality as standard processing + +### Input Length Handling + +- **Within max_embed_len**: Input is accepted and processed (up to 3M+ tokens) +- **Exceeds max_position_embeddings**: Chunked processing is automatically triggered +- **Exceeds max_embed_len**: Input is rejected with clear error message +- **No environment variables required**: Works without `VLLM_ALLOW_LONG_MAX_MODEL_LEN` + +### Extreme Long Text Support + +With `MAX_EMBED_LEN=3072000`, you can process: + +- **Academic papers**: Full research papers with references +- **Legal documents**: Complete contracts and legal texts +- **Books**: Entire chapters or small books +- **Code repositories**: Large codebases and documentation + +## 📊 Performance Characteristics + +### Chunked Processing Performance + +| Aspect | Behavior | Performance | +|--------|----------|-------------| +| **Chunk Processing** | All chunks processed with native pooling | Consistent with input length | +| **Cross-chunk Aggregation** | MEAN weighted averaging | Minimal overhead | +| **Memory Usage** | Proportional to number of chunks | Moderate, scalable | +| **Semantic Quality** | Complete text coverage | Optimal for long documents | + +## 🧪 Test Cases + +The test client demonstrates: + +- ✅ **Short text**: Normal processing (baseline) +- ✅ **Medium text**: Single chunk processing +- ✅ **Long text**: Multi-chunk processing with aggregation +- ✅ **Very long text**: Many chunks processing +- ✅ **Extreme long text**: Document-level processing (100K+ tokens) +- ✅ **Batch processing**: Mixed-length inputs in one request +- ✅ **Consistency**: Reproducible results across runs + +## 🐛 Troubleshooting + +### Common Issues + +1. **Chunked processing not enabled**: + + ```log + ValueError: This model's maximum position embeddings length is 4096 tokens... + ``` + + **Solution**: Ensure `enable_chunked_processing: true` in pooler config + +2. **Input exceeds max_embed_len**: + + ```log + ValueError: This model's maximum embedding input length is 3072000 tokens... + ``` + + **Solution**: Increase `max_embed_len` in pooler config or reduce input length + +3. **Memory errors**: + + ```log + RuntimeError: CUDA out of memory + ``` + + **Solution**: Reduce chunk size by adjusting model's `max_position_embeddings` or use fewer GPUs + +4. **Slow processing**: + **Expected**: Long text takes more time due to multiple inference calls + +### Debug Information + +Server logs show chunked processing activity: + +```log +INFO: Input length 150000 exceeds max_position_embeddings 4096, will use chunked processing +INFO: Split input of 150000 tokens into 37 chunks (max_chunk_size: 4096) +``` + +## 🤝 Contributing + +To extend chunked processing support to other embedding models: + +1. Check model compatibility with the pooling architecture +2. Test with various text lengths +3. Validate embedding quality compared to single-chunk processing +4. 
Submit PR with test cases and documentation updates + +## 🆕 Enhanced Features + +### max_embed_len Parameter + +The new `max_embed_len` parameter provides: + +- **Simplified Configuration**: No need for `VLLM_ALLOW_LONG_MAX_MODEL_LEN` environment variable +- **Flexible Input Validation**: Accept inputs longer than `max_model_len` up to `max_embed_len` +- **Extreme Length Support**: Process documents with millions of tokens +- **Clear Error Messages**: Better feedback when inputs exceed limits +- **Backward Compatibility**: Existing configurations continue to work diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/online_serving/openai_embedding_long_text/client.py new file mode 100644 index 0000000000..6e9838ac6d --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/client.py @@ -0,0 +1,366 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example script demonstrating long text embedding with chunked processing in vLLM. + +This example shows how to use vLLM's chunked processing feature to handle text +inputs that exceed the model's maximum token length. The feature automatically +splits long text into chunks and handles different pooling types optimally. + +Prerequisites: +1. Start vLLM server with chunked processing enabled: + + # MEAN pooling (processes all chunks, recommended for complete coverage) + vllm serve intfloat/multilingual-e5-large \ + --override-pooler-config \ + '{"pooling_type": "MEAN", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ + --served-model-name multilingual-e5-large \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + + # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) + vllm serve BAAI/bge-large-en-v1.5 \ + --override-pooler-config \ + '{"pooling_type": "CLS", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ + --served-model-name bge-large-en-v1.5 \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + +2. Install required dependencies: + pip install openai requests +""" + +import time + +import numpy as np +from openai import OpenAI + +# Configuration +API_KEY = "your-api-key" # Replace with your actual API key +BASE_URL = "http://localhost:31090/v1" +MODEL_NAME = "multilingual-e5-large" + + +def generate_long_text(base_text: str, repeat_count: int) -> str: + """Generate long text by repeating base text.""" + return base_text * repeat_count + + +def test_embedding_with_different_lengths(): + """Test embedding generation with different text lengths.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + # Test cases with different text lengths + test_cases = [ + { + "name": "Short Text", + "text": "Hello, this is a short text for embedding.", + "expected_chunks": 1, + }, + { + "name": "Medium Text", + "text": generate_long_text( + "This is a medium-length text that should fit within the " + "model's context window. " * 20, + 2, + ), + "expected_chunks": 1, + }, + { + "name": "Long Text (2 chunks)", + "text": generate_long_text( + "This is a very long text that will exceed the model's " + "maximum context length and trigger chunked processing. " * 50, + 5, + ), + "expected_chunks": 2, + }, + { + "name": "Very Long Text (3+ chunks)", + "text": generate_long_text( + "This text is extremely long and will definitely " + "require multiple chunks for processing. 
" * 100, + 10, + ), + "expected_chunks": 3, + }, + ] + + print("🧪 Testing vLLM Long Text Embedding with Chunked Processing") + print("=" * 70) + + for i, test_case in enumerate(test_cases, 1): + print(f"\n📝 Test {i}: {test_case['name']}") + print(f"Text length: {len(test_case['text'])} characters") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=test_case["text"], model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + # Extract embedding data + embedding = response.data[0].embedding + embedding_dim = len(embedding) + + print("✅ Success!") + print(f" - Embedding dimension: {embedding_dim}") + print(f" - Processing time: {processing_time:.2f}s") + print(f" - Expected chunks: ~{test_case['expected_chunks']}") + print(f" - First 5 values: {embedding[:5]}") + + except Exception as e: + print(f"❌ Failed: {str(e)}") + + +def test_batch_embedding(): + """Test batch embedding with mixed-length inputs.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔄 Testing Batch Embedding with Mixed Lengths") + print("=" * 50) + + # Mix of short and long texts + batch_inputs = [ + "Short text 1", + generate_long_text("Medium length text that fits in one chunk. " * 20, 1), + "Another short text", + generate_long_text("Long text requiring chunked processing. " * 100, 5), + ] + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("✅ Batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + print( + f" - Average time per input: {processing_time / len(batch_inputs):.2f}s" + ) + + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D embedding" + ) + + except Exception as e: + print(f"❌ Batch processing failed: {str(e)}") + + +def test_multiple_long_texts_batch(): + """Test batch processing with multiple long texts to verify chunk ID uniqueness.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔧 Testing Multiple Long Texts in Batch (Chunk ID Fix Verification)") + print("=" * 70) + + # Create multiple distinct long texts that will all require chunking + # Note: All pooling types now use MEAN aggregation across chunks: + # - Native pooling (MEAN/CLS/LAST) is used within each chunk + # - MEAN aggregation combines results across all chunks + # - Full semantic coverage for all pooling types + long_texts = [ + generate_long_text( + "First long document about artificial intelligence and machine learning. " + * 80, + 6, + ), + generate_long_text( + "Second long document about natural language processing and transformers. " + * 80, + 6, + ), + generate_long_text( + "Third long document about computer vision and neural networks. 
" * 80, 6 + ), + ] + + # Add some short texts to mix things up + batch_inputs = [ + "Short text before long texts", + long_texts[0], + "Short text between long texts", + long_texts[1], + long_texts[2], + "Short text after long texts", + ] + + print("📊 Batch composition:") + for i, text in enumerate(batch_inputs): + length = len(text) + text_type = "Long (will be chunked)" if length > 5000 else "Short" + print(f" - Input {i + 1}: {length} chars ({text_type})") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("\n✅ Multiple long texts batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings returned: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + + # Verify each embedding is different (no incorrect aggregation) + embeddings = [data.embedding for data in response.data] + + if len(embeddings) >= 3: + import numpy as np + + # Compare embeddings of the long texts (indices 1, 3, 4) + long_embeddings = [ + np.array(embeddings[1]), # First long text + np.array(embeddings[3]), # Second long text + np.array(embeddings[4]), # Third long text + ] + + print("\n🔍 Verifying embedding uniqueness:") + for i in range(len(long_embeddings)): + for j in range(i + 1, len(long_embeddings)): + cosine_sim = np.dot(long_embeddings[i], long_embeddings[j]) / ( + np.linalg.norm(long_embeddings[i]) + * np.linalg.norm(long_embeddings[j]) + ) + print( + f" - Similarity between long text {i + 1} and {j + 1}: " + f"{cosine_sim:.4f}" + ) + + if ( + cosine_sim < 0.9 + ): # Different content should have lower similarity + print(" ✅ Good: Embeddings are appropriately different") + else: + print( + " ⚠️ High similarity - may indicate chunk " + "aggregation issue" + ) + + print("\n📋 Per-input results:") + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + embedding_norm = np.linalg.norm(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D " + f"embedding (norm: {embedding_norm:.4f})" + ) + + print( + "\n✅ This test verifies the fix for chunk ID collisions in " + "batch processing" + ) + print(" - Before fix: Multiple long texts would have conflicting chunk IDs") + print(" - After fix: Each prompt's chunks have unique IDs with prompt index") + + except Exception as e: + print(f"❌ Multiple long texts batch test failed: {str(e)}") + print(" This might indicate the chunk ID collision bug is present!") + + +def test_embedding_consistency(): + """Test that chunked processing produces consistent results.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔍 Testing Embedding Consistency") + print("=" * 40) + + # Use the same long text multiple times + long_text = generate_long_text( + "Consistency test text for chunked processing validation. 
" * 50, 3 + ) + + embeddings = [] + + try: + for i in range(3): + response = client.embeddings.create( + input=long_text, model=MODEL_NAME, encoding_format="float" + ) + embeddings.append(response.data[0].embedding) + print(f" - Generated embedding {i + 1}") + + # Check consistency (embeddings should be identical) + if len(embeddings) >= 2: + # Calculate similarity between first two embeddings + + emb1 = np.array(embeddings[0]) + emb2 = np.array(embeddings[1]) + + # Cosine similarity + cosine_sim = np.dot(emb1, emb2) / ( + np.linalg.norm(emb1) * np.linalg.norm(emb2) + ) + + print("✅ Consistency test completed!") + print(f" - Cosine similarity between runs: {cosine_sim:.6f}") + print(" - Expected: ~1.0 (identical embeddings)") + + if cosine_sim > 0.999: + print(" - ✅ High consistency achieved!") + else: + print(" - ⚠️ Consistency may vary due to numerical precision") + + except Exception as e: + print(f"❌ Consistency test failed: {str(e)}") + + +def main(): + """Main function to run all tests.""" + print("🚀 vLLM Long Text Embedding Client") + print(f"📡 Connecting to: {BASE_URL}") + print(f"🤖 Model: {MODEL_NAME}") + masked_key = "*" * (len(API_KEY) - 4) + API_KEY[-4:] if len(API_KEY) > 4 else "****" + print(f"🔑 API Key: {masked_key}") + + # Run all test cases + test_embedding_with_different_lengths() + test_batch_embedding() + test_multiple_long_texts_batch() + test_embedding_consistency() + + print("\n" + "=" * 70) + print("🎉 All tests completed!") + print("\n💡 Key Features Demonstrated:") + print(" - ✅ Automatic chunked processing for long text") + print(" - ✅ Seamless handling of mixed-length batches") + print(" - ✅ Multiple long texts in single batch (chunk ID fix)") + print(" - ✅ Unified chunked processing:") + print(" • Native pooling used within each chunk") + print(" • MEAN aggregation across all chunks") + print(" • Complete semantic coverage for all pooling types") + print(" - ✅ Consistent embedding generation") + print(" - ✅ Backward compatibility with short text") + print("\n📚 For more information, see:") + print( + " - Documentation: https://docs.vllm.ai/en/latest/models/pooling_models.html" + ) + print(" - Chunked Processing Guide: openai_embedding_long_text.md") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh new file mode 100644 index 0000000000..f356d7d452 --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# vLLM Embedding Server with Enhanced Chunked Processing +# This script starts a vLLM server with chunked processing enabled for long text embedding. +# Now supports proper pooling type validation and model-specific configurations. 
+ +set -euo pipefail + +# Configuration +MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"} +MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"} + +PORT=${PORT:-31090} +GPU_COUNT=${GPU_COUNT:-1} +MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000} +API_KEY=${API_KEY:-"your-api-key"} + +# Enhanced pooling configuration with model-specific defaults +POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST +export VLLM_ENABLE_CHUNKED_PROCESSING=true +export CUDA_VISIBLE_DEVICES=2,3,4,5 +# export VLLM_ATTENTION_BACKEND=XFORMERS + +echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing" +echo "==================================================================" + +# Environment variables for optimization +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +# Function to determine optimal pooling type for known models +get_optimal_pooling_type() { + local model="$1" + case "$model" in + *"e5-"* | *"multilingual-e5"*) + echo "MEAN" # E5 series native pooling + ;; + *"bge-"*) + echo "CLS" # BGE series native pooling + ;; + *"gte-"*) + echo "LAST" # GTE series native pooling + ;; + *"sentence-t5"* | *"st5"*) + echo "MEAN" # Sentence-T5 native pooling + ;; + *"jina-embeddings"*) + echo "MEAN" # Jina embeddings native pooling + ;; + *"Qwen"*"Embedding"*) + echo "LAST" # Qwen embeddings native pooling + ;; + *) + echo "MEAN" # Default native pooling for unknown models + ;; + esac +} + +# Auto-detect pooling type if not explicitly set +if [ "$POOLING_TYPE" = "auto" ]; then + POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME") + echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME" +fi + +# Display configuration +echo "📋 Configuration:" +echo " - Model: $MODEL_NAME" +echo " - Port: $PORT" +echo " - GPU Count: $GPU_COUNT" +echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}" +echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens" +echo " - Native Pooling Type: $POOLING_TYPE + Normalization" +echo " - Cross-chunk Aggregation: MEAN (automatic)" +echo "" + +# Validate GPU availability +if command -v nvidia-smi &> /dev/null; then + gpu_count=$(nvidia-smi --list-gpus | wc -l) + echo "🖥️ Available GPUs: $gpu_count" + if [ "$GPU_COUNT" -gt "$gpu_count" ]; then + echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available" + echo " Adjusting to use $gpu_count GPUs" + GPU_COUNT=$gpu_count + fi +else + echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped." +fi + +# Chunked processing uses unified MEAN aggregation +echo "ℹ️ Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks" +echo " - All chunks processed for complete semantic coverage" +echo " - Weighted averaging based on chunk token counts" + +echo "" +echo "🔧 Starting server with enhanced chunked processing configuration..." + +# Build pooler config JSON +POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" + +# Start vLLM server with enhanced chunked processing +vllm serve "$MODEL_NAME" \ + --tensor-parallel-size "$GPU_COUNT" \ + --enforce-eager \ + --override-pooler-config "$POOLER_CONFIG" \ + --served-model-name ${MODEL_CODE} \ + --api-key "$API_KEY" \ + --trust-remote-code \ + --port "$PORT" \ + --host 0.0.0.0 + +echo "" +echo "✅ vLLM Embedding Server started successfully!" 
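+# For reference, with the defaults above POOLER_CONFIG expands to roughly:
+#   {"pooling_type": "MEAN", "normalize": true, "enable_chunked_processing": true, "max_embed_len": 3072000}
+# A quick smoke test of the running server (a sketch; substitute your own
+# host, port, API key and served model name):
+#   curl "http://localhost:${PORT}/v1/embeddings" \
+#     -H "Content-Type: application/json" \
+#     -H "Authorization: Bearer ${API_KEY}" \
+#     -d "{\"model\": \"${MODEL_CODE}\", \"input\": \"hello world\"}"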
+echo "" +echo "📡 Server Information:" +echo " - Base URL: http://localhost:$PORT" +echo " - Model Code: ${MODEL_CODE}" +echo " - API Key: $API_KEY" +echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" +echo "" +echo "🧪 Test the server with:" +echo " python examples/online_serving/openai_embedding_long_text_client.py" +echo "" +echo "📚 Enhanced features enabled:" +echo " ✅ Intelligent native pooling type detection" +echo " ✅ Unified MEAN aggregation for chunked processing" +echo " ✅ Model-specific native pooling optimization" +echo " ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)" +echo " ✅ Complete semantic coverage for all pooling types" +echo " ✅ OpenAI-compatible API" +echo " ✅ GPU acceleration" +echo "" +echo "🔧 Advanced usage:" +echo " - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection" +echo " - Set MAX_EMBED_LEN to adjust maximum input length" +echo " - All pooling types use MEAN aggregation across chunks" diff --git a/tests/entrypoints/openai/test_embedding_long_text.py b/tests/entrypoints/openai/test_embedding_long_text.py new file mode 100644 index 0000000000..86bd34abb9 --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_long_text.py @@ -0,0 +1,441 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test cases for long text embedding with automatic chunking mechanism. + +This test suite validates vLLM's automatic chunking functionality for handling +text inputs that exceed the model's maximum token length, specifically targeting +the intfloat/multilingual-e5-small model (max token length: 512). +""" + +import random + +import openai +import pytest +import pytest_asyncio + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...utils import RemoteOpenAIServer + + +def _generate_random_text(word_count: int) -> str: + """Generate random text with approximately the specified word count.""" + # Common English words with focus on verbs and nouns for realistic text + common_words = [ + # Essential articles and pronouns (minimal) + "the", + "and", + "you", + "they", + "this", + "that", + "these", + "those", + + # Action verbs + "create", + "build", + "develop", + "design", + "implement", + "execute", + "analyze", + "process", + "generate", + "calculate", + "evaluate", + "optimize", + "transform", + "integrate", + "configure", + "deploy", + "monitor", + "manage", + "discover", + "explore", + "investigate", + "research", + "study", + "examine", + "improve", + "enhance", + "upgrade", + "modify", + "update", + "maintain", + "solve", + "resolve", + "handle", + "address", + "tackle", + "overcome", + "communicate", + "collaborate", + "coordinate", + "organize", + "plan", + "achieve", + "accomplish", + "complete", + "finish", + "deliver", + "provide", + + # Technology and science nouns + "system", + "application", + "software", + "hardware", + "network", + "database", + "algorithm", + "model", + "framework", + "platform", + "interface", + "protocol", + "architecture", + "infrastructure", + "component", + "module", + "service", + "technology", + "innovation", + "solution", + "methodology", + "approach", + "artificial", + "intelligence", + "machine", + "learning", + "neural", + "network", + "computer", + "processor", + "memory", + "storage", + "computation", + "data", + "information", + "knowledge", + "insight", + "pattern", + "trend", + "analysis", + "research", + "development", + "engineering", + "science", + "mathematics", + "statistics", + "probability", + "optimization", 
+ "performance", + "efficiency", + + # General nouns + "project", + "team", + "organization", + "company", + "business", + "industry", + "market", + "customer", + "user", + "client", + "product", + "feature", + "function", + "requirement", + "specification", + "documentation", + "report", + "result", + "outcome", + "impact", + "benefit", + "advantage", + "challenge", + "problem", + "opportunity", + "strategy", + "goal", + "objective", + "target", + "milestone", + "process", + "procedure", + "workflow", + "pipeline", + "operation", + "task", + "activity", + "event", + "session", + "meeting", + "discussion", + "decision" + ] + + words = [] + for _ in range(word_count): + words.append(random.choice(common_words)) + + # Add some punctuation for more realistic text + text = " ".join(words) + # Add periods every 10-20 words + words_list = text.split() + result = [] + for i, word in enumerate(words_list): + result.append(word) + if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1): + result[-1] += "." + + return " ".join(result) + + +MODEL_NAME = "intfloat/multilingual-e5-small" +DTYPE = "bfloat16" + +# Test text: Generate text with approximately 1500 words to exceed 1024 tokens +LONG_TEXT_1500_WORDS = _generate_random_text(1500) + +# Test text: Generate text with approximately 2500 words to exceed 2048 tokens +LONG_TEXT_2500_WORDS = _generate_random_text(2500) + + +@pytest.fixture(scope="module") +def server_with_chunked_processing(): + """Start server with automatic chunking processing enabled.""" + args = [ + "--runner", + "pooling", + "--dtype", + DTYPE, + "--enforce-eager", + "--max-model-len", + "512", # Set smaller max_model_len to trigger chunking mechanism + '--override-pooler-config', + ('{"pooling_type": "MEAN", "normalize": true, ' + '"enable_chunked_processing": true, "max_embed_len": 10000}'), + "--gpu-memory-utilization", + "0.8", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client_with_chunked_processing(server_with_chunked_processing): + """Create async client with chunking processing support.""" + async with server_with_chunked_processing.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_1500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~1500 character long text + (~1028 tokens, exceeding 512 token limit).""" + + # Verify text length + # Verify text has sufficient word count (approximately 1500 words) + word_count = len(LONG_TEXT_1500_WORDS.split()) + assert word_count >= 1400, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_1500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~1500 words, we expect roughly + # 1024+ tokens (exceeding 512 token limit) + # Should exceed single chunk limit of 512 + assert 
embeddings.usage.prompt_tokens > 800 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_2500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~2500 character long text + (~2048 tokens, requiring multiple chunks).""" + + # Verify text length + # Verify text has sufficient word count (approximately 2500 words) + word_count = len(LONG_TEXT_2500_WORDS.split()) + assert word_count >= 2300, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_2500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~2500 words, we expect + # roughly 2048+ tokens (requiring multiple chunks) + # Should require multiple chunks for processing + assert embeddings.usage.prompt_tokens > 1500 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_long_text_embedding( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test batch long text embedding processing.""" + + input_texts = [ + LONG_TEXT_1500_WORDS, + LONG_TEXT_2500_WORDS, + "This is a short text test.", # Short text for comparison + ] + + # Send batch embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 # Three input texts + + # Verify each embedding dimension + for i, embedding_data in enumerate(embeddings.data): + assert len(embedding_data.embedding) == 384 + assert embedding_data.index == i + + # Verify embedding vector validity + embedding_vector = embedding_data.embedding + assert all(isinstance(x, float) for x in embedding_vector) + assert not all(x == 0 for x in embedding_vector) + + # Verify token usage + assert embeddings.usage.completion_tokens == 0 + # Total token count should be very substantial + assert embeddings.usage.prompt_tokens > 1000 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chunked_vs_normal_consistency( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test consistency between chunked and + normal processing (using short text).""" + + # Use a short text within the 512 token limit + short_text = ("Artificial intelligence technology is changing our world, " + "bringing unprecedented opportunities and challenges.") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[short_text], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 384 + assert embeddings.usage.completion_tokens == 0 + # Short text should not require chunked processing + assert embeddings.usage.prompt_tokens < 512 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # 验证embedding向量的有效性 + embedding_vector = embeddings.data[0].embedding + assert all(isinstance(x, float) for x in embedding_vector) + assert not all(x == 0 for x in embedding_vector) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chunked_processing_response_format( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test response format and structure during chunked processing.""" + + # Test with long text to trigger chunking + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_1500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert embeddings.data[0].object == "embedding" + assert embeddings.data[0].index == 0 + + # Verify embedding vector properties + embedding_vector = embeddings.data[0].embedding + import math + vector_norm = math.sqrt(sum(x * x for x in embedding_vector)) + # Check that the vector is normalized + # (default behavior for most embedding models) + assert 0.8 < vector_norm < 1.2, ( + f"Vector norm should be reasonable, actual: {vector_norm}") diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6649cd89ee..b4ea15ef5a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2598,6 +2598,25 @@ class PoolerConfig: ``math-shepherd-mistral-7b-prm`` model. """ + enable_chunked_processing: Optional[bool] = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + + max_embed_len: Optional[int] = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + This parameter enables accepting long inputs without requiring + VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds + max_embed_len, it will be handled according to the original max_model_len + validation logic. Defaults to None (i.e. set to max_model_len). 
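+    As an illustration (a sketch based on the chunked-processing test in this
+    PR): with max_model_len=512, max_embed_len=10000 and
+    enable_chunked_processing=True, a 2000-token prompt is accepted and split
+    into ceil(2000 / 512) = 4 chunks (512 + 512 + 512 + 464 tokens); the
+    per-chunk embeddings are then combined with a token-count-weighted mean,
+    i.e. (512*e1 + 512*e2 + 512*e3 + 464*e4) / 2000.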
+ """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 84ba008731..9dcad8e391 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -from typing import Final, Literal, Optional, Union, cast +from collections.abc import AsyncGenerator, Mapping +from typing import Any, Final, Literal, Optional, Union, cast import numpy as np +import torch from fastapi import Request from typing_extensions import assert_never, override @@ -12,19 +14,28 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger +# yapf conflicts with isort for this docstring +# yapf: disable from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, OpenAIServing, - ServeContext) + RequestPrompt, + ServeContext, + TextTokensPrompt) +# yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, - PoolingRequestOutput) + PoolingOutput, PoolingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams +from vllm.utils import chunk_list logger = init_logger(__name__) @@ -46,6 +57,17 @@ def _get_embedding( class EmbeddingMixin(OpenAIServing): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + pooler_config = self.model_config.pooler_config + + # Avoid repeated attribute lookups + self.supports_chunked_processing = bool( + pooler_config and pooler_config.enable_chunked_processing) + self.max_embed_len = (pooler_config.max_embed_len if pooler_config + and pooler_config.max_embed_len else None) + @override async def _preprocess( self, @@ -129,6 +151,435 @@ class EmbeddingMixin(OpenAIServing): usage=usage, ) + def _get_max_position_embeddings(self) -> int: + """Get the model's effective maximum sequence length for chunking.""" + return self.model_config.max_model_len + + def _should_use_chunked_processing(self, request) -> bool: + """Check if chunked processing should be used for this request.""" + return isinstance( + request, + (EmbeddingCompletionRequest, + EmbeddingChatRequest)) and self.supports_chunked_processing + + async def _process_chunked_request( + self, + ctx: EmbeddingServeContext, + original_prompt: TextTokensPrompt, + pooling_params, + trace_headers, + prompt_idx: int, + ) -> list[AsyncGenerator[PoolingRequestOutput, None]]: + """Process a single prompt using chunked processing.""" + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + token_ids = original_prompt["prompt_token_ids"] + + # Split into chunks using max_position_embeddings + max_pos_embeddings = self._get_max_position_embeddings() + # Process all chunks for MEAN aggregation + for chunk_idx, chunk_tokens in enumerate( + chunk_list(token_ids, max_pos_embeddings)): + # Create a request ID for this chunk + 
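+            # For illustration: with the "embd" request-id prefix used by
+            # OpenAIServingEmbedding, a chunk id looks roughly like
+            # "embd-<base id>-prompt-0-chunk-2"; including the prompt index in
+            # the id is what keeps chunk ids unique when a batch contains
+            # several long prompts.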
chunk_request_id = (f"{ctx.request_id}-prompt-{prompt_idx}-" + f"chunk-{chunk_idx}") + + # Create engine prompt for this chunk + chunk_engine_prompt = EngineTokensPrompt( + prompt_token_ids=chunk_tokens) + + # Create chunk request prompt for logging + chunk_text = "" + chunk_request_prompt = TextTokensPrompt( + prompt=chunk_text, prompt_token_ids=chunk_tokens) + + # Log the chunk + self._log_inputs(chunk_request_id, + chunk_request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Create generator for this chunk and wrap it to return indices + original_generator = self.engine_client.encode( + chunk_engine_prompt, + pooling_params, + chunk_request_id, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + generators.append(original_generator) + + return generators + + def _validate_input( + self, + request, + input_ids: list[int], + input_text: str, + ) -> TextTokensPrompt: + """Override to support chunked processing for embedding requests.""" + token_num = len(input_ids) + + # Note: EmbeddingRequest doesn't have max_tokens + if isinstance(request, + (EmbeddingCompletionRequest, EmbeddingChatRequest)): + # Check if chunked processing is enabled for pooling models + enable_chunked = self._should_use_chunked_processing(request) + + # Use max_position_embeddings for chunked processing decisions + max_pos_embeddings = self._get_max_position_embeddings() + + # Determine the effective max length for validation + if self.max_embed_len is not None: + # Use max_embed_len for validation instead of max_model_len + length_type = "maximum embedding input length" + max_length_value = self.max_embed_len + else: + # Fall back to max_model_len validation (original behavior) + length_type = "maximum context length" + max_length_value = self.max_model_len + + validation_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. Please reduce the length of the input.") + + chunked_processing_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. 
Please reduce the length of the input " + "or enable chunked processing.") + + # Check if input exceeds max length + if token_num > max_length_value: + raise ValueError( + validation_error_msg.format( + length_type=length_type, + max_length_value=max_length_value, + token_num=token_num)) + + # Check for chunked processing + # when exceeding max_position_embeddings + if token_num > max_pos_embeddings: + if enable_chunked: + # Allow long inputs when chunked processing is enabled + logger.info( + "Input length %s exceeds max_position_embeddings " + "%s, will use chunked processing", token_num, + max_pos_embeddings) + else: + raise ValueError( + chunked_processing_error_msg.format( + length_type="maximum position embeddings length", + max_length_value=max_pos_embeddings, + token_num=token_num)) + + return TextTokensPrompt(prompt=input_text, + prompt_token_ids=input_ids) + + # For other request types, use the parent's implementation + return super()._validate_input(request, input_ids, input_text) + + def _is_text_tokens_prompt(self, prompt) -> bool: + """Check if a prompt is a TextTokensPrompt (has prompt_token_ids).""" + return (isinstance(prompt, dict) and "prompt_token_ids" in prompt + and "prompt_embeds" not in prompt) + + async def _create_single_prompt_generator( + self, + ctx: EmbeddingServeContext, + engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt], + request_prompt: RequestPrompt, + pooling_params: PoolingParams, + trace_headers: Optional[Mapping[str, str]], + prompt_index: int, + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + """Create a generator for a single prompt using standard processing.""" + request_id_item = f"{ctx.request_id}-{prompt_index}" + + self._log_inputs(request_id_item, + request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Mypy has an existing bug related to inferring the variance + # of TypedDicts with `builtins.enumerate`: + # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 + engine_prompt = cast(Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + + # Return the original generator without wrapping + return self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + @override + async def _prepare_generators( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Override to support chunked processing.""" + ctx = cast(EmbeddingServeContext, ctx) + + # Check if we should use chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + # If no chunked processing needed, delegate to parent class + if not use_chunked: + return await super()._prepare_generators(ctx) + + # Custom logic for chunked processing + generators: list[AsyncGenerator[Union[RequestOutput, + PoolingRequestOutput], + None]] = [] + + try: + trace_headers = (None if ctx.raw_request is None else await + self._get_trace_headers(ctx.raw_request.headers)) + + pooling_params = self._create_pooling_params(ctx) + if isinstance(pooling_params, ErrorResponse): + return pooling_params + + # Verify and set the task for pooling params + try: + pooling_params.verify("embed", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + if ctx.request_prompts is None: + return self.create_error_response( + 
"Request prompts not available") + + max_pos_embeddings = self._get_max_position_embeddings() + + for i, engine_prompt in enumerate(ctx.engine_prompts): + request_prompt = ctx.request_prompts[i] + + # Check if this specific prompt needs chunked processing + if self._is_text_tokens_prompt(request_prompt): + # Cast to TextTokensPrompt since we've verified + # prompt_token_ids + text_tokens_prompt = cast(TextTokensPrompt, request_prompt) + if (len(text_tokens_prompt["prompt_token_ids"]) + > max_pos_embeddings): + # Use chunked processing for this prompt + chunk_generators = await self._process_chunked_request( + ctx, text_tokens_prompt, pooling_params, + trace_headers, i) + generators.extend(chunk_generators) + continue + + # Normal processing for short prompts or non-token prompts + # Cast engine_prompt to the expected type for mypy + engine_prompt_typed = cast( + Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + generator = await self._create_single_prompt_generator( + ctx, engine_prompt_typed, request_prompt, pooling_params, + trace_headers, i) + generators.append(generator) + + from vllm.utils import merge_async_iterators + ctx.result_generator = merge_async_iterators(*generators) + + return None + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + @override + async def _collect_batch( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Collect and aggregate batch results + with support for chunked processing. + + For chunked requests, performs online aggregation to + minimize memory usage. + For regular requests, collects results normally. + """ + ctx = cast(EmbeddingServeContext, ctx) + try: + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + # Check if we used chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + if not use_chunked: + return await super()._collect_batch(ctx=ctx) + + if ctx.request_prompts is None: + return self.create_error_response( + "Request prompts not available") + + if ctx.result_generator is None: + return self.create_error_response( + "Result generator not available") + + # Online aggregation for chunked requests to + # minimize memory usage + # Track aggregation state for each prompt + prompt_aggregators: dict[int, dict[str, Any]] = {} + short_prompts_results: dict[int, PoolingRequestOutput] = {} + + async for result_idx, result in ctx.result_generator: + if "-chunk-" in result.request_id: + # Extract prompt_idx from chunked request_id + parts = result.request_id.split("-") + try: + prompt_idx = int(parts[parts.index("prompt") + 1]) + except (ValueError, IndexError): + # Fallback: extract from result_idx if parsing fails + prompt_idx = result_idx + + # Initialize aggregator for this prompt if needed + if prompt_idx not in prompt_aggregators: + prompt_aggregators[prompt_idx] = { + 'weighted_sum': None, + 'total_weight': 0, + 'chunk_count': 0, + 'request_id': result.request_id.split("-chunk-")[0] + } + + aggregator = prompt_aggregators[prompt_idx] + + # MEAN pooling with online weighted averaging + # Ensure result is PoolingRequestOutput + # for embedding processing + if not isinstance(result, PoolingRequestOutput): + return self.create_error_response( + f"Expected PoolingRequestOutput for " + f"chunked embedding, got " + f"{type(result).__name__}") + + # Handle both PoolingOutput and + # EmbeddingOutput types + if hasattr(result.outputs, 'data'): + # PoolingOutput case + 
embedding_data = result.outputs.data + elif hasattr(result.outputs, 'embedding'): + # EmbeddingOutput case - + # convert embedding list to tensor + embedding_data = result.outputs.embedding + else: + return self.create_error_response( + f"Unsupported output type: " + f"{type(result.outputs).__name__}") + + if not isinstance(embedding_data, torch.Tensor): + embedding_data = torch.tensor(embedding_data, + dtype=torch.float32) + + if result.prompt_token_ids is None: + return self.create_error_response( + "prompt_token_ids cannot be None for " + "chunked processing") + weight = len(result.prompt_token_ids) + + weighted_embedding = embedding_data.to( + dtype=torch.float32) * weight + + if aggregator['weighted_sum'] is None: + # First chunk + aggregator['weighted_sum'] = weighted_embedding + else: + # Accumulate + aggregator['weighted_sum'] += weighted_embedding + + aggregator['total_weight'] += weight + aggregator['chunk_count'] += 1 + else: + # Non-chunked result - extract prompt_idx from request_id + parts = result.request_id.split("-") + try: + # Last part should be prompt index + prompt_idx = int(parts[-1]) + except (ValueError, IndexError): + prompt_idx = result_idx # Fallback to result_idx + + short_prompts_results[prompt_idx] = cast( + PoolingRequestOutput, result) + + # Finalize aggregated results + final_res_batch: list[Union[PoolingRequestOutput, + EmbeddingRequestOutput]] = [] + num_prompts = len(ctx.engine_prompts) + + for prompt_idx in range(num_prompts): + if prompt_idx in prompt_aggregators: + # Finalize MEAN aggregation for this chunked prompt + aggregator = prompt_aggregators[prompt_idx] + + weighted_sum = aggregator['weighted_sum'] + total_weight = aggregator['total_weight'] + + if (weighted_sum is not None + and isinstance(weighted_sum, torch.Tensor) + and isinstance(total_weight, + (int, float)) and total_weight > 0): + + # Compute final mean embedding + final_embedding = weighted_sum / total_weight + + # Create a PoolingRequestOutput + # for the aggregated result + pooling_output_data = PoolingOutput( + data=final_embedding) + + # Get original prompt token IDs for this prompt + original_prompt = ctx.request_prompts[prompt_idx] + if not self._is_text_tokens_prompt(original_prompt): + return self.create_error_response( + f"Chunked prompt {prompt_idx} is not a " + f"TextTokensPrompt") + + original_token_ids = cast( + TextTokensPrompt, + original_prompt)["prompt_token_ids"] + + pooling_request_output = PoolingRequestOutput( + request_id=aggregator['request_id'], + prompt_token_ids=original_token_ids, + outputs=pooling_output_data, + finished=True) + + final_res_batch.append(pooling_request_output) + else: + return self.create_error_response( + f"Failed to aggregate chunks " + f"for prompt {prompt_idx}") + elif prompt_idx in short_prompts_results: + final_res_batch.append( + cast(PoolingRequestOutput, + short_prompts_results[prompt_idx])) + else: + return self.create_error_response( + f"Result not found for prompt {prompt_idx}") + + ctx.final_res_batch = cast( + list[Union[RequestOutput, PoolingRequestOutput]], + final_res_batch) + + return None + + except Exception as e: + return self.create_error_response(str(e)) + class OpenAIServingEmbedding(EmbeddingMixin): request_id_prefix = "embd" From 98deac3879860b829dd9a30b19bbb2adb9c96e7f Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 13 Aug 2025 20:27:25 +0800 Subject: [PATCH 233/932] [FEATURE] support custom vllm tuned config path for fused moe triton kernels (#22791) Signed-off-by: Chi Zhang --- vllm/envs.py | 6 ++++ 
.../layers/fused_moe/fused_moe.py | 28 +++++++++++++------ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 931edcfa7f..e7796aa73d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -158,6 +158,7 @@ if TYPE_CHECKING: VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None def get_default_cache_root(): @@ -1120,6 +1121,11 @@ environment_variables: dict[str, Callable[[], Any]] = { # never removed from memory until the server terminates. "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), + + # Allows vllm to find tuned config under customized folder + "VLLM_TUNED_CONFIG_FOLDER": + lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ad094c37f9..98087a35e1 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -701,20 +701,32 @@ def get_moe_configs( block_shape = [block_n, block_k] if block_n and block_k else None json_file_name = get_config_file_name(E, N, dtype, block_shape) - config_file_path = os.path.join( + config_file_paths = [] + + # note that we prioritize user defined config + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + user_defined_config_file_path = os.path.join( + user_defined_config_folder, json_file_name) + config_file_paths.append(user_defined_config_file_path) + + default_config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) - if os.path.exists(config_file_path): - with open(config_file_path) as f: - logger.info("Using configuration from %s for MoE layer.", - config_file_path) - # If a configuration has been found, return it - return {int(key): val for key, val in json.load(f).items()} + config_file_paths.append(default_config_file_path) + + for config_file_path in config_file_paths: + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info("Using configuration from %s for MoE layer.", + config_file_path) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} # If no optimized configuration is available, we will use the default # configuration logger.warning( ("Using default MoE config. Performance might be sub-optimal! 
" - "Config file not found at %s"), config_file_path) + "Config file not found at %s"), config_file_paths) return None From 6b794c756c5a6b3c443c19a093435d02d91d525f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 13 Aug 2025 15:03:53 +0200 Subject: [PATCH 234/932] [Nixl][CI] Fix tests (#22806) Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_nixl_connector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 3860d7c857..b185936ab0 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -229,6 +229,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): num_blocks=1, block_len=self.block_len, attn_backend_name=self.backend_name, + # `self.kv_cache_layout` is only forced to HND when vllm engine + # is started. We mock HND here. + kv_cache_layout="HND", ), remote_tp_size=remote_tp_size) return {0: remote_agent_name} From fceafaf582cd72e6636f47127a665afb9e0ea0aa Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 13 Aug 2025 06:07:09 -0700 Subject: [PATCH 235/932] [Bugfix][mamba] Fix type annotation of Mamba2Metadata (#22787) Signed-off-by: Chen Zhang --- .../layers/mamba/mamba_mixer2.py | 8 ++-- vllm/v1/attention/backends/mamba_attn.py | 39 +++++++++++-------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index d5f4877135..10a5618c22 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -473,12 +473,12 @@ class MambaMixer2(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] state_indices_tensor = attn_metadata.state_indices_tensor - has_initial_states_p = attn_metadata.has_initial_states + has_initial_states_p = attn_metadata.has_initial_states_p prep_initial_states = attn_metadata.prep_initial_states chunk_size = attn_metadata.chunk_size - seq_idx_p = attn_metadata.seq_idx - chunk_indices_p = attn_metadata.chunk_indices - chunk_offsets_p = attn_metadata.chunk_offsets + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p else: conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 7c1226049f..3f84f8967d 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -68,14 +68,19 @@ class Mamba2AttentionMetadata: query_start_loc: torch.Tensor seq_lens: torch.Tensor - has_initial_states: torch.Tensor prep_initial_states: bool chunk_size: int - seq_idx: torch.Tensor - chunk_indices: torch.Tensor - chunk_offsets: torch.Tensor + + # The following tensors only contain prefill requests and will be None if + # the batch has no prefill request. 
+ has_initial_states_p: Optional[torch.Tensor] + seq_idx_p: Optional[torch.Tensor] + chunk_indices_p: Optional[torch.Tensor] + chunk_offsets_p: Optional[torch.Tensor] state_indices_tensor: torch.Tensor # shape: [batch,] + + # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.tensor] = None @@ -115,11 +120,11 @@ class Mamba2AttentionMetadataBuilder( query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens - seq_idx = None - chunk_indices, chunk_offsets = None, None + seq_idx_p = None + chunk_indices_p, chunk_offsets_p = None, None # Need flags to indicate if there are initial states # currently we really only support the FlashAttention backend - has_initial_states = None + has_initial_states_p = None prep_initial_states = False state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] @@ -135,25 +140,25 @@ class Mamba2AttentionMetadataBuilder( common_attn_metadata. num_computed_tokens_cpu[num_reqs - num_prefills:num_reqs] > 0) prep_initial_states = torch.any(has_initial_states_cpu).item() - has_initial_states = has_initial_states_cpu.to( + has_initial_states_p = has_initial_states_cpu.to( query_start_loc.device) query_start_loc_p = common_attn_metadata.query_start_loc[ -num_prefills - 1:] - num_decode_tokens - seq_idx = torch.repeat_interleave(torch.arange( + seq_idx_p = torch.repeat_interleave(torch.arange( num_prefills, dtype=torch.int32, device=query_start_loc_p.device), - query_start_loc_p.diff(), - output_size=num_prefill_tokens) - seq_idx.unsqueeze_(0) + query_start_loc_p.diff(), + output_size=num_prefill_tokens) + seq_idx_p.unsqueeze_(0) # We compute metadata for chunked prefill once at the top level # model forward and reuse them in mamba layers. If not needed, # they will be ignored inside mamba kernels. if prep_initial_states: - chunk_indices, chunk_offsets = ( + chunk_indices_p, chunk_offsets_p = ( _query_start_loc_to_chunk_indices_offsets( query_start_loc_p, self.chunk_size, num_prefill_tokens)) @@ -173,12 +178,12 @@ class Mamba2AttentionMetadataBuilder( num_decode_tokens=num_decode_tokens, query_start_loc=query_start_loc, seq_lens=seq_lens, - has_initial_states=has_initial_states, prep_initial_states=prep_initial_states, chunk_size=self.chunk_size, - seq_idx=seq_idx, - chunk_indices=chunk_indices, - chunk_offsets=chunk_offsets, + has_initial_states_p=has_initial_states_p, + seq_idx_p=seq_idx_p, + chunk_indices_p=chunk_indices_p, + chunk_offsets_p=chunk_offsets_p, state_indices_tensor=state_indices_tensor, ) return attn_metadata From 6772bb0f7d58d137576f386dd921117a5a00f0fb Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Wed, 13 Aug 2025 21:07:28 +0800 Subject: [PATCH 236/932] Remove unnecessary CUDA sync of qwen image and video preprocess (#22792) Signed-off-by: cyy Signed-off-by: Yuanyuan Chen Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6bea180ffe..5bcbcc4f0e 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -976,10 +976,12 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. 
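+        # (A note on the change: grid_thw lives on the GPU in this path, so
+        # grid_thw.prod(-1) followed by .tolist() would block on a
+        # device-to-host copy; deriving the sizes from the CPU-side
+        # grid_thw_list sidesteps that, as the comment below notes.)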
+ # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return image_embeds.split(sizes.tolist()) + return image_embeds.split(sizes) def _process_video_input( self, @@ -998,9 +1000,11 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return video_embeds.split(sizes.tolist()) + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} From b159c0a67aaafe865c785d289335c3760e01a62f Mon Sep 17 00:00:00 2001 From: Gh0u1L5 Date: Wed, 13 Aug 2025 21:08:23 +0800 Subject: [PATCH 237/932] Fix GGUF loader for Qwen3 MoE. (#22785) Signed-off-by: Gh0u1L5 --- vllm/model_executor/model_loader/gguf_loader.py | 11 +++++++++++ vllm/model_executor/models/qwen3_moe.py | 1 + 2 files changed, 12 insertions(+) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 26af87c1ed..21655b0c69 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -74,6 +74,17 @@ class GGUFModelLoader(BaseModelLoader): f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ f"model.layers.{idx}.mlp.experts.0.up_proj.weight" + if model_type in ("qwen2_moe", "qwen3_moe"): + model_type = model_type.replace("_", "") + # GGUF layer map assumes that we will have a merged expert weights + # so we need to map them manually + for idx in range(config.num_hidden_layers): + gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.down_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.up_proj.weight" arch = None for key, value in gguf.MODEL_ARCH_NAMES.items(): diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 085fc90b47..61b16b6a1d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -375,6 +375,7 @@ class Qwen3MoeModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, From 20d65aa75548c4ee0c9b69847f177dec085cd358 Mon Sep 17 00:00:00 2001 From: milesial Date: Wed, 13 Aug 2025 06:09:26 -0700 Subject: [PATCH 238/932] [Frontend] Multithreaded async multimodal load_bytes (#22710) Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> --- vllm/envs.py | 7 +++++++ vllm/multimodal/utils.py | 26 ++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 
e7796aa73d..145ec3495a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,6 +63,7 @@ if TYPE_CHECKING: VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 4 @@ -555,6 +556,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Max number of workers for the thread pool handling + # media bytes loading. Set to 1 to disable parallel processing. + # Default is 8 + "VLLM_MEDIA_LOADING_THREAD_COUNT": + lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")), + # Maximum filesize in MB for a single audio file when processing # speech-to-text requests. Files larger than this will be rejected. # Default is 25 MB diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 8dfbc65035..b8266fd350 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import atexit +from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union @@ -33,6 +36,10 @@ else: MultiModalKwargs = Any MultiModalPlaceholderDict = Any +global_thread_pool = ThreadPoolExecutor( + max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT) +atexit.register(global_thread_pool.shutdown) + class MediaConnector: @@ -139,19 +146,26 @@ class MediaConnector: fetch_timeout: Optional[int] = None, ) -> _M: url_spec = urlparse(url) + loop = asyncio.get_running_loop() if url_spec.scheme.startswith("http"): connection = self.connection data = await connection.async_get_bytes(url, timeout=fetch_timeout) - - return media_io.load_bytes(data) + future = loop.run_in_executor(global_thread_pool, + media_io.load_bytes, data) + return await future if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + future = loop.run_in_executor(global_thread_pool, + self._load_data_url, url_spec, + media_io) + return await future if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) - + future = loop.run_in_executor(global_thread_pool, + self._load_file_url, url_spec, + media_io) + return await future msg = "The URL must be either a HTTP, data or file URL." 
raise ValueError(msg) @@ -489,4 +503,4 @@ def fetch_video( "video": video_io_kwargs } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) - return media_connector.fetch_video(video_url) \ No newline at end of file + return media_connector.fetch_video(video_url) From 19b927e52df8400084df1c8116af7d6f0a5f5d15 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 13 Aug 2025 22:18:07 +0800 Subject: [PATCH 239/932] [Core] Use individual MM items in P0/P1 cache and model runner (#22570) Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 235 +++++++------------ tests/v1/core/test_kv_cache_utils.py | 48 ++-- tests/v1/core/test_prefix_caching.py | 31 ++- tests/v1/core/test_scheduler.py | 21 +- tests/v1/core/utils.py | 19 +- tests/v1/engine/test_engine_core.py | 2 +- tests/v1/engine/test_engine_core_client.py | 2 +- tests/v1/engine/test_output_processor.py | 10 +- tests/v1/kv_connector/unit/utils.py | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 +- tests/v1/worker/test_gpu_input_batch.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- vllm/multimodal/inputs.py | 141 +++++++++-- vllm/multimodal/utils.py | 135 ++++++----- vllm/v1/core/sched/output.py | 10 +- vllm/v1/engine/__init__.py | 6 +- vllm/v1/engine/core.py | 7 +- vllm/v1/engine/mm_input_cache.py | 78 +++--- vllm/v1/engine/processor.py | 64 ++--- vllm/v1/request.py | 21 +- vllm/v1/serial_utils.py | 48 ++-- vllm/v1/worker/gpu_input_batch.py | 13 +- vllm/v1/worker/gpu_model_runner.py | 97 ++++---- vllm/v1/worker/tpu_model_runner.py | 39 ++- 24 files changed, 549 insertions(+), 486 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 3fdf7e33ca..41f4773a11 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -5,7 +5,7 @@ import base64 import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, NamedTuple, Optional +from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest @@ -19,14 +19,12 @@ from vllm.distributed.parallel_state import (init_distributed_environment, initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import (MediaConnector, - merge_and_sort_multimodal_metadata, +from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, run_dp_sharded_vision_model) from vllm.platforms import current_platform from vllm.utils import get_open_port, update_environment_variables if TYPE_CHECKING: - from vllm.multimodal.hasher import MultiModalHashDict from vllm.multimodal.inputs import MultiModalPlaceholderDict # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -178,19 +176,17 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async -# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +# Used for `test_argsort_mm_positions`. 
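+# Each TestCase pairs a placeholder layout (mm_positions) with the order in
+# which argsort_mm_positions is expected to visit the (modality, item index)
+# pairs, i.e. sorted by each placeholder's offset in the prompt.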
class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" - mm_hashes: Optional["MultiModalHashDict"] - expected_modalities: list[str] - expected_ranges: list[PlaceholderRange] - expected_hashes: Optional[list[str]] + expected_modality_idxs: list[tuple[str, int]] -def test_merge_and_sort_multimodal_metadata(): +def test_argsort_mm_positions(): test_cases = [ - # Single modality should return result as is but flattened + # Single modality + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -198,34 +194,27 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=3, length=2), ] }, - mm_hashes={"image": ["hash1", "hash2"]}, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=2), + expected_modality_idxs=[ + ("image", 0), + ("image", 1), ], - expected_hashes=["hash1", "hash2"], ), - - # Single modality without hashes return None for mm hash. + ## Internally unsorted TestCase( mm_positions={ "image": [ + PlaceholderRange(offset=3, length=2), PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), ] }, - mm_hashes=None, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), + expected_modality_idxs=[ + ("image", 1), + ("image", 0), ], - expected_hashes=None, ), - # Multiple modalities with hashes should return sorted modalities - # and flattened ranges and hashes. + # Two modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -237,47 +226,54 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=2, length=3), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1", "audio_hash2"], - }, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), - ], - expected_hashes=[ - "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + expected_modality_idxs=[ + ("audio", 0), + ("audio", 1), + ("image", 0), + ("image", 1), ], ), - - # Multiple modalities without hashes should return sorted modalities - # and flattened ranges and None. 
+ ## Interleaved, internally sorted TestCase( mm_positions={ "image": [ - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + PlaceholderRange(offset=0, length=4), + PlaceholderRange(offset=8, length=2), ], "audio": [ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=5, length=2), + PlaceholderRange(offset=11, length=4), ] }, - mm_hashes=None, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + expected_modality_idxs=[ + ("image", 0), + ("audio", 0), + ("image", 1), + ("audio", 1), + ], + ), + ## Interleaved, internally unsorted + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=8, length=2), + PlaceholderRange(offset=0, length=4), + ], + "audio": [ + PlaceholderRange(offset=11, length=4), + PlaceholderRange(offset=5, length=2), + ] + }, + expected_modality_idxs=[ + ("image", 1), + ("audio", 1), + ("image", 0), + ("audio", 0), ], - expected_hashes=None, ), # Three modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -293,72 +289,16 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=12, length=6), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1"], - "video": ["video_hash1", "video_hash2", "video_hash3"] - }, - expected_modalities=[ - "audio", "video", "video", "video", "image", "image" - ], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=4), - PlaceholderRange(offset=7, length=5), - PlaceholderRange(offset=12, length=6), - PlaceholderRange(offset=15, length=7), - PlaceholderRange(offset=22, length=8), - ], - expected_hashes=[ - "audio_hash1", "video_hash1", "video_hash2", "video_hash3", - "image_hash1", "image_hash2" + expected_modality_idxs=[ + ("audio", 0), + ("video", 0), + ("video", 1), + ("video", 2), + ("image", 0), + ("image", 1), ], ), - ] - - for (mm_positions, mm_hashes, expected_modalities, expected_ranges, - expected_hashes) in test_cases: - modalities, ranges, hashes = merge_and_sort_multimodal_metadata( - mm_positions, mm_hashes) - - assert modalities == expected_modalities - assert ranges == expected_ranges - assert hashes == expected_hashes - - -def test_merge_and_sort_multimodal_metadata_with_interleaving(): - - test_cases = [ - - # R;YGh)Gi0%dA~;Y3!s6?_IG=x1zb%MzeR_D~z*~~la(lAO zdMXBxOk$a>dG=xkl*H31jYX&n6HR)5DWoFkyuZZGbC|WhaE~!mEHXV04zfz3QPCVr z=OXKPdTw!Q8fZt#yIFrj_a39|+m>+hfsIN+-Z-C0(1m8k+E3xGuV zX0N&W404&9)~kaRnGIgF#pGw}r%Of$d;*iDS_bV_Vk>@AitmPfEIZ9NF%gQQ8-*pw zrK)vd@Q@TzX()^E@ev9n@^IG5L=oXLMvWSAE?X(R61D3$tlsdpe676gw?Y<8&QGb1 zrn9OWec@D3Fi0%J6)i5pRB&AOSIOyoF2Bf8+KDELV32W1ZB&X$MG>%LBx4DD%j%ZiGr=IqwiNMJ=T$`>1?+bb;as<&$SF&|Lg(Xo}P#f5E;jHGipuGydvKKP_qCXPmE@?5`4$6__R@zYaXa**%#VjFr472DE0inLg%0+ydG8({b)J+mxjg#62<_(B)fgWP zT!mfj&nW52qNL@msWK;eN|SR?z7}Q{2`_(yaRZMk?yvPTwV+Rch^IiwO#s z&vUwhb9WoK(b-RfYEAlR;LQom6YwTR&j%>ee0>CnJkRog@KcRKd`m~o*W$z$CMXr- z^^*BHt9Qx^`-G8F=K14BZ{+6(c31Z7=dKq8i@N^tM2hf;X^#e{tZlD3FM z4B>avdJ!lw@~d2BqD~GdNRV&P15@pIk1*9Bgs<9B?lHb{p`0T!S>Qy=b-(Jf)Arok z@sc{ycnIs}9^{sbaYn%i=W%~QF*V!j@si2K20AYC=o*i(G_bHln1 zY}x=HE|m&>D4HC$lPa(qRq@XI8x}eoYKgdfzlTngYMon%`&6 zC$aRsNR#zqqpN$1iq!8(=`KX15XffmI0S8tqzxRkVAzWlct5A`-fzYATjHq3t$N=E zu+K$M{;1*>PR;VV(ZC?YN=UT-Nut>Drqy+8JZb!BJ|82Dn7@_SMkW78fu46_emK@U 
zaGao0$jxItmhj83*6=j;M{EFzfm6twWGX~u6iZ&TOU!E|; zlELJ|J_9kFTE1a6j)e?=-pmsI3niD4rSd+`^CsFGK!Q4y0Cs>OSG#P^-au5TOvU^u9g*X>R+2FYlcs(6~=vuFyXi0FcpUGlIIoLueVRkvR02;!YGo&Z6!3*LJ znS8r}QpDm}-+di5*`=sgFDH0}Hn}A)H1gK`}eNv zPrFFq!{X_3D^d0*e<;)}x{#Rl*b7W1kDRWHAjNAJ$(qTgau*@3fC{t|(MfsV zVh-dhB@MA01DdDonq_Z|Ad>3^r`U~b--wnfs;6}gB03;9-qP}fDkhTXvu46BKPMJ( z_?*@b|Gbb$--vul5x8flAw~gNr!YDLrT={dIgW_Ov#)6R@O3YMa_zFAhx27G?d1r!s>afD%%CF8rOq)nIh8u5Z2 zO#E23V!+cn3kNUQhKtEdOpfOT5XlC&U!B{DjzJAfd%@d)yaBx#y2#vO7pT6_7|0HJ^PM^e>dFP1ytnj>f%#vDC> zk9f~^i)-cbBn^_}mOVLuk~_GvNS(P|z^TnZLJKxJ^PnvTyrDzV~lxDImtgZ|Vuu z%)c6zM8s^?NLKXb=(lg5Ajj7zV;LV7!ZdL2#*0guw$x&$ z%C%tlOhzmQk9qo+yc9Rguh^e|mRu-Mb-DKJZ;aZ;lq>$@2g>@@Q_g-gJSa#I>zd zJM%)vQD9nJ1R%-pq2b*>WH)6KV&F-^^Iv79kgYI$piJ3h1dPgl-xSkCfvVITYG6aZ z*3F;YyhpS>1QmYs=ke&AgJB606dI)J>NT z5XUf+OUmJ^Fdu^zl(y`6@v6Br0F7R?Zm41OicOm7eQl{1=w>OU+;5J)BFF=Abx0xI zjBdipOJ~QfRbm$T9E*N|cNb}RJL{d>HkqsDJB-ai>TQ2reBN7z%Hv|eZTU{`=J=ts zTbVQXV}$IF>*j;I>DK{PBgPre#SAY$Owus0G%TA;{(N7I2ucV>s-$EL$inanfaZyA zFj6n@BlM}XW50qqsOcm?Zq9ir2X)-&TZitf$9nrdUM9u9JZq{yYE<$8Qbf_C9Hp26 z+xBeKk3D(J2Ue|pJ0q*@*j=bGL4#h-)w&gaZL;kJD%f2yke7u(2%-r{z37~_Dl+TV z(qEUwA}}DM4A4B;KAS2mhWq#mtaVPUoAymwt$KJF9KzzW+of+l2%hF8<~+p&V)TCX za`m-)vR^3siwm23Y>WwtlpytUaUj2!Houa0H0ZBB^s>o3G}H^_XS*%it4fU0IHXZp zF#!)PxHknzM)tl$v8M6a{5eTG$76VcX#hl`w5EAi#pb%3_#WB<^h(upHhSaTnrT{s z2T>l{KA&~|2|?Po@C?9UIlYpu1dZEjF^HCW1CnR;ln-21w@)af9tXz`gH~}vY?U$1 zKqbn#0ww0@kYgb#Dczqm48 z3-B4#y1G;BHP(`2r5+g>M{>ko_6K+sfQcdj45FM6m0;9RfnNnMDsjCWEs{GeH(*)|}+aV~hi zbZFuc_Tlcy3VvIPkWFJER&V%xV4-g_7`<-#1QAkE22v>C9Y|F^uG*i5E-XE_J>%!8 zdJ*OqA5W+B0s$Z?Ew>J4$GQ%T0EttO@Vrb*9KnbCfW;ez3lj6801`U+3@V_>z9G2s z5XjK3e<|_3uZ~-7~rQ;RTE( z4-+*kB2|3rm?0cY2ZS#1!k>o%4(Dt$(-{>*)y!WR~ ztvlpG*#^-Fo&$itX(3hnk6DF8#f64iSRQMke`5}TE>Y}s^)&#N%D9aO&QLP03o)?Ba+6wD0_2MGiOgEDCcuL zMrHaee9cns8rNgtI=i-wUdIj)DPJJ=Jc7!Z@*M%@^UBpJCLK*-$!rJxxOs{>m<8IF zi5E8IvYinV0p7(%Y7d*qB@v!c3j0&z zsF?i+EP1mJ^DjQozL5FA|KtOetaBFq#at+Zc$VUdHN_tEBPaVWBWQT6E7 z^XDrVU(Rj2&drC}Wdg~KDB(bdEKwv%BI$f`p4Z2Tm8>R`jSKcm!zqC)D#y(t^=eX< zQ=I3rue2gx^*1JWCZcmL3_Z9gy4jASyKwwgE`Q^|eu#Ig&SXQDK2d`xw=w^8o8V)2@F&)<|B&r8sJkKilQ+L)U z1MlJQEx&}uCpwdU4vwfPLTtoC;3m4z|^T;4rxH{HzzUdv;~v1w(-nIZpmAR)5!>M=WWlHG{{fZdC$I5CR&{s%}C0i z^MRw1f*?6xrOyikNrw5pz+JnwH$EJxuS6k`%O+?sH1*fZU({lBreS6yU+N-|+n?FG z7N?{P@{6?wA#`d2br)*17&pqiL`?R96rGBg=E)6P!|6;-u7;l%d`~cMnRLn_lM5(i zCoFDhLd~Mv@Hcv*;5yrO1a8SedU=_;ax<8hjjtJ@|6tXhR(l5Srye^nssNZ_h{+x zL`ZFFi5f$yj@x>)*7^ZBQ!S^9^`^S?A-XNncymN)b_}mz8~}mH!M4PyNf(nSJvjq6 zD{LlV)QJMY-&Rk3QW1Xw{E*We3AufIXsuBf6^G(C$CFV1f&IcLV>jczk0hyqc(6|6 z38VL=9MRJatF)bpKXPsdececzG$|tN^qPHO&CFLxmxO|nY~d*Jb;y-(9nF4-KHjTx z+F)soba;Q$&$e#@4CXF88^hP)qC1I(F`C(qcrr0)m5!x980fw4zDpi7|GtY)+njfJ zb4wxc+uy4GgzfcGa|{xSoGiNvrNIo z>%a*Ha3r~)c$~DReTmGIzcyWqCg8YZ0xuI4X&MuJsTYFfwp%DSB4gUz)*H5M%s8Bp z+0tj2-OiYHe238Gi8)|gvd3~JGH}Ry@lSTovksZ&d;y<(xJ|PpeZ{`67vkNd#Z-zF z81l+fj2au5(`^QmKxMO5+JHbQ1XbnY<8ICt=gA4fZrO3V}C96f2>7Nmw291s#(cF}X=MAs{ht0-QvPUbJusROSm zh0o&JLuH+p^cpnPM-H6ZBZ$4!M6Iyknn4s!B10?&zslFtE8k+7)0SLGLiSq`KdHKy z1W;>Ya>Mw4lgZ2wg3B}*PRC&)`1mYWIwtEZ@%6Z;+?UeKI4@jA^Z+TIw!CK~tA(0X zA&C8`jz^|z3uj>kH>tK!bMS;sQ9#T~DdP5V=Gs9mZNrm=;~T|Iw_oNPO!b1YStNDM z#DDd5wmgGfXK9)dGdFBEJ#cDb9+(R)lgW=E!)ta19xz{>3)Fke@vDAnv z9L#%v4ABW~7!}r-E4L%dReNr+2#}NonO~%8Z$I4ih&{8Z{Sd94BcobkOIEN($RH0o z<57A+Nz*;#9Wqd-rPclOHaaF>|c@}ZM zQihVTz@BQBJy+MxpddFbz?Cqsk$3XG>jr5#l~b@}5a49ClFh{KSzmc08W{sS!OMbh zh`{hDkM6@{!U1dRLs|)kMZ}thJQ16a{v+K7`PkhVR1q^^l8xIMidAhy8O!91Od{(} zbX``4%o!YI_+l6XjlY5dqse);FOfazyB<_VMDDX8uOT~zfd!5}DT7RDEr*j!0(T8T zn$xz2^8Ls4MR8uZ0AHt)%7=574uiETC6Vg;?-D;?e6B?S4JRFms<_G`QcLqpS%2T7 
zKVlB2Rx>bwEUVRNTMfk|fun_E7v#8kU4c9^TOoqumEE31Qcv^Qwbs0o{auG2CR!6~ zy6Idthv2tq+vDMZ&XwE>uR6!~q)lAbm5$Tt@8mJ|l zk_W$NmvzGUSZlLPA!mAB5hXI}>)N8JA0bbzaR#|W@X>m0&ALykG%Snq<9~l3XatiT zkP$L~puhqS*>1O>`&p^Tlvw~v27JRrinEoO{=OGCc*|(O@0;W9`V7t%|%z6Vs&N=tdm4HRx zvTeNe;hjYcgq6Y}Q*v<8^*)$!c&(iG9nzDwqLw7&c@^=6<_rUm;CrE{(UV;>wYQ-k zy%nv3m|o8-baTL=8J90EJ;6jG(#*P#-Rf$Aw+Mlg0Lbmj0rt-XMIx9rpyIp2z90)| zPqSRbN^nPC`hkxMjLupaJt$eb|AH&I3JnY(w>1Y668L4q-5*dX&GPB)< z36#PwX2)AwzYFa5t3&DwLPkL_y3%;_r)5*ROgf^+6^eR%g*1-$f6+>Tp!lS3(eiMm zqOj6SBYv&5e!|d~z@UgoRxRUg;A@Ry`uH(@Z^XCt-2w0mntpXfx@@veSH0R*regtm ze5<0Q>qyWgET%N$*f(`6EpIFeKfIJOmk9UMmVPgS!3&Jq{!nSY_*12MaPaGQs@?Zt zzvt@JH;P6}VG4VtC~TZ>V>pJb@G};>au6=}wAN|dK0moMVFJ`dQI(=RQ+V*1%eiYK zduLFK1Q1)1`wZaS%$RK_*6Tm*2c5| zGEFGbl}!&!O9@y21Vs6cT~kShBDGuj{MKGFC$q0ztsQED~Pe;&C znO-{Y6fCQ&eHR*t-hbkY5R{j#VI#wD*4@<|z~k78z_Sh_^m_z;f# zyVY<;xy(QjsVg1I#)&@9iDuZJ7R||jvS^;y7hQnim)=&e*xA=SqyhT0jm@-Fad50&#wfWMAUReXF|X)Xx0&lhKf_M;`~3%}sc9Iy z?Z-fbG(h3jz9t;@kkqfZ$Lkw}=b4o)_^|_Nb&~Cx9J=PV*F!Rstd~@h*!;Rjdw7Nh znM6vgr{9OiY(2j_mmpC2@!irV#AO$jrIks>S_;yQE~r>m+P=T;L=ullTfMkGq3g=s z>S>BbC6z?t>erZeRinRYLZ4aPbb`-6-~9TGYNtF_XeQThNl(gjUhGo#{`yId$t4HK zF(C@?I+9qrU1j&0B+xXJNMD2!mKYGZTa;$?(;w0;z#+9fGfDb9x^V1#47)t}}r( z^tJI5hlmpfN6nN|uAnv81Ta1}wS3Vyly}T@h_U{6J?1(q!EL*in{{=8sF{>D;Zn`4 zs_{SRF%|Q#z9~=^QSw?z&x;c_u{B6Y_|L(UmYFz#SWNpkEKkYbp>uaV%Wk5_F}lh2 zP9Ag}aXC~hj$4vT4dih3YNl1XKw6wHmeC{8;0}Qa4B5zZLj_?UNTAt_Ede*J ztxKR^{^+jJA4Y1!kl205fRXI>t=4jj_|=hcrY3T zxh_WEpN)Qt9ZWCl3Gpr z)1%OKg(PWOpyjg3Mx_!^ZK|`VW9Xmj-ss!2Y_NDm#!-M{HJRx_Uzl}{Xh@FnZbz$~ zvUz4*P?u1K=3Tg$U#K;UlVBaUzksy57%p>FYtG;M_j*dQlo< z6duhte)xo&1ihWLY$x1K31O&y*yyIEaMo_F|K7F$j1fY1w$39;N*1(D>yZyD^Q+Rb z)A5P`q0y&@;e;8=)$$kzT);ror@((Of&SwKE_2#C;SwtH1yd*vLAw;~bG4f{0>cB!g1%lRK0o@BTG~=R2 z<$X1_=q@5VU?$zj&J}mt(N6ROPV^(rn=YGSF55X>R+hz+Yzepq+qZGzUfcCS`bARc z?%D)c9ma^Oc_`yZMsLAccS|6PRPOptC!?P)`d^c#lUR0xUzV_g_)9K7m0y3fW3$GW zhux%hovEF}*q#^F572-Jl+1lJT&lGymLWeODZ4#cyk1H&xX^>gc3nclr5$ltzTPH5 zcYnsOo8`KJ4pLlF2`nFVUUVw2Uw^UPbmKE}Bq4m7AL!W#tGjQTD$e5SoBK9@aDD_( zY#LZX`ybLYoIsY{Yw{4q!E|#$jz=%8R+S4{vN*!NdUU$Q6_jH|#(UiF!Y2_JmT2Fg zt6F7mF&limGy55AJ&$C^!RxEsrDj(L7e32EDZw%u?EeycG7LE~82Milw z+kI{E*e(Mvs7Eub7AE~X8nHQh0q(uMy|6O;hUZaRu9W!6<+FG8H_{TFWf zmaPE<`dx2S?FoG_39uElv*2!3g4U?W!e7SNpH+LCHz?JN1+3pFwiWfXyC+>O`B$$l z2`CF5t#eZSvZNXG9!Ry^9zuOXp}Evvg9Y0Bn00Z=i%fIjxYUUGd~7dq$&i04af+e! 
zV+g}5j1h-zf&5Eko3|dj8q4I5L-L_G^aRVG?zUbJBzZ#-GhV_FiNL?e3oq}_n+FAFkNS(XUm|Mqo>RHpF zcLi0Tp&Fwmj>+tYaL*-P-)$S=g^SZ}ewGI#U$Fr;;8YUQw;(^AN>q0WO+rrvUUv)x z{t_2m-fKzW(+?!>Pp6NW&fD41>TL&yeoS0n`K_;Y+?6~LhrXlL(`noMu&Z&?Rnk$1 zY>D=TTd*KqWb(RB{MP%cCF&+QDvlN$)ey(&hm~lQ;Ww$w1MjNV&xbed4)ywr&sP&h zW!#Hko+$|oM5+`>p^>EN{8OE^zZ;FHHzT=GcYBfg{z5!YUTtgP&Hk~ACdF|}w#0jx zZNf?viaV5UlfR4NvvIQ6-f4AyWn=qzfhzMKE8F8FDX7$)Vi@z}nf`NAy&?=JE$z^C zUq018*VF&=XMbKgQ4`}dYO|off55+g_{57m0)g-@Gh{~m$Ey0Zv}e)b4lxN>|GBAt zjRyFlHLw4}-dhFL)va&73GNU`a19y=?hYYXa0u=m+}%9{w*Y~KySux)yDfa-?s{hS z>HY8B-Su^ys&ms9UB$(^SheQhoi@fhe$Vrid(!?#dH!=KZ!v#50ORfBRuuc^r25b} z;6S05trvw7zW*>9(lKDXxDW|Evj5+O$@(ah%wn7x@V~3h0s;lE?U~+p0}mPj^nX6$ z2Pkx-F+!WDKSy0b$dYSGcf;lFL)vCjJp_(xKmXxB@rC&q^CpD*P?Oi|!I7ekH!t}g zzqpwI)aCqF3McvhSW+Nc{=fbC4Nw9n03dAhQvdI-{JSVXb^Z^l3x-`^4?rWTw%zag zK4oiu>ms*3TPJfnJa$P9q}QrZyYYFhz2vPtP33oyZF_wQ>3K;~J-*M%$}$?d+JBwr z+e!&3U+NCVwfiC^_jAysqF7>mUe_l9?EN@;iPrD)r10|cXfw{lCkge64uzPR8dVHg zdjK-+O^%x?vK|3bsqO1eN;?zv;9w_hEtr?X@^+(}RPKxEP%OK5L|X_#>&@ii+oRkL z(5VpuEkHo(z5O`xIFst(_Uy!HfEV&fOJdTHZ}D~HCW}su%-O1lONQ3B^)A6z1V^jw zVfpR5^DSYg3i-{(Kl19yV4z+Pub&7-RS)=&Ixoedbf?S4Q^IO7!tO9JphHK zgU@cou+M55lc+;X9JZMQepD>-gdI%gEf2?iHZQph-RK8WU5o&*<=rFAvdc-d z!1Dv|{;A-P3eP6DuuI;|((sG-&Tymvqkx%Vdal=HAt@pgiEs<%fmVx8EIt&f8vo6KO--wseb(?E_S} z;_BjQ=bc)GZOlpv*%qY_fxy#5sO?ssjAFq8RxH0N@!%iU#P^KaDigld1_VE7+Apwa zDDX%;)ebnVKgxTOUGBDfNj}}A#eA1thU{=`y!#n%FHkl>uiYfg+hGz&xVlomD+O#h z8+Z3bEL8{+T{zi&I9q81=W0gjKpCoPNqEkW+r{kO{WL5;;MMlHx?+mE>#^jgXHLkHd^0(gryIp@{%d`fPl(*Uf{k zSrrh{JF3V0C4heaDhuy)l9px( ze2xM}!@h__yy`WH?>1&r@n#n{{tK`oS?(P$PfTZ5g*wG`%)usu7eZ>Ejsbs7%gHXE z)uIno7LOc*7ah=w<-+9LZWrIx&M9^~Eapn%+Mwaw1vZIXR>*XHu5%st7dit2?CK2@ z%c>0Lank0`DG0!3@*%%gk8P!h!*Rjacx-yjWG-6OB3wG_m13XtwcV9xyeV;> zFL{D*G4!}kpE2jkmPvuQ-B`D3aI~k}r7OJbx~aM#^%+jQZycofka9hT#Vj=GO$)nt zI59m|e^lJFhqKf$Bs~tu9UlB$s>$efz-l`FZEJUY2vm3)Po&8?EPgmYlqmxKcIbQz zYfRQtcQ5c_>-Aat@ZL0q_q>y?DI`Rr?8$L}iZZXWeU$56th zcGcgXbtq2TUfWuVI6lyIUQNoM7;bHHoMTDI_x02sB+OT8Yr1)?rAiGLN+1Z7yj?mQ zyxz;cFqmhEWfe!mf_3FPb`+;ab=#ftMRRfUP4n0CH^sp$~c zvq}`n>*Se_!wKJI%|P?C$w+*@#S9z2t*Z^Y`Fuh`u?oR>=*OkTH~TLJ=yBpB=+7Z8 zQ5I(}f6RAIZi=ez4R#5tM_V!SRKeb|qQEl33E75eTW(t!;R4s4{2gOU+sZvL_;Njy zQ8ML*Xw*&XFactTAN@L^8G4F)k10(f8PdX+!-!#jRx7fFN;wsP zo5ju)P0USrAq+7LOlE_n4gka%X=i9?loR|JkG+g~I5F}0v7KY+q_lnolkVh^)o5a@ z7tD-l(?UEPjz$$u!(6i@hwPTy z0^2SmJdBQ_d{|TLLgK3necTLc_1*z9C70pcU^Q`7B3|#$Dtr@Gwl>QGz9HsZ{vKp< zLO7kzk40@}*ILDfLi4q5mpkvscQlD6->%}Ai9JHOb<($|30R?!-8>o08m{^`bsnfQ zC6Nl$!#ra9@9-BNlF2@vv|QCGTg>$sgj;lUr&soZWSh~SEeE8!rQzAr@W7Z$U$CfT zSx#Y3O?nC@IT{w-na_r>Za{Yy7ClYEz9csf@?f)yIyv8E7(NsgTp})sYxN5MGBnvx znP{*En+>h;w?vk{M(oi)$+Ftp^ty{J#$n9FXap>3C#?|TEjAWcAs_W>>(&nPcNYN9 z38*yJ$dg%(f8|;Tn^s?BA3Ul9M%Z(1dtQUFl&r#lltg~6Op8n+Oi(yYE5kW(UblJX z#jna9u=1Pw7LfobLz;6W<1~PLN5iG;IxDZc=2K~c+sRb!h_|dhZ7s2!F4nUo+39a@ zQSzdK?MZyhAeB9mq{4VuAqV1%g(4n}FHMIAlYOo)t&cZ@hWD3)PRx*4YB>fVG?s~E z;FMv0T@9!2GU^7@H1P4cgSSAq#v)&q#e4Is(m?79z5AGd^-}R$!wotO^;)xgf5eb8 zP?Hnur{kGS5Z~2$z(vni)Nsd(7v;0pt((AGcg75a*>Y9YhpPiJ-dT_07;FJsAF&;c zd|=nrt#l-bNAzH}%&Mhp&gO37G8KY)ZFFO~2-DZCTs+SuJYUF+(aL&%x#JRfzg#)q zZgT7nxNntjkCX+IvJcf9E~qcnnN#vNyTnnFd`5K}h>oy1_c)cDA0x{3Y-H(^A^3LE z{wj8hNk)Va(uL*E-DBHUPwNzc&lX*0r7;WylNq11++XPZ{>-9(gxjb@u(dIuAAufA zuc-p5TDESMQ8%9DjzQz?kG+tsEs{${I>u(VCAGn!$FwSWen_!_x#x;fG1?M~VgcXQ z7uBTmyQlymne@+(H_QuZg_`~68UFTgHlmKS;kYkV+~`4gLpVLVVWJ14E3KNUh1$h= z(&^m8Ev8tc;XdhBVEvZPv*+(Lx??EyW##;y7%Pux$`}XxHYmpdPX%!4h;cfJ;MY6t zU=)14i+I(vZgwuPbbEldt|u8F@s6!B9>S&C;NZZ(p|nTrApL1i5W&HQT`3CP`3U^}4tMe0@0zw{@_#JKrMJ6t*hQRas!h0EP4 zNO8GQ^ufHGpt9(a^WKlm)+rz0se&7Qz-ex<%Pvlo%xR;uUj*`F`UQi9+mFxbF&BQZ 
zcW>Y_J9%IMdC;8F9Ojm&XgKCY%`t=w#`W((ooskI)EHttgd}zx7U)l#87!e+uDNT5l-{rzwzc z*fa%*UJ0y`IlAo0TMN6rA+Ij?EI#IToLlpn*B&brw>RQ^)I;Y6oJzUTiBN*A9ly6! z+h%f`z7lcUe~;XYx8CywT`;;xo?oP2S%VfHUmP4}LvSC#8`nDinh8kntZ zh1JG{YvDKH3H4Yp*o zna}&;ADrYK9<$+4hz+LlTJ?A9Rq`sQidrdl8&p^-BSRkf%mXJ*PlmoCW5i*DEILCa zLM&1YHAqto0WV+H_F18gKn2)gLZy0Bhmsps)GQvC!;Mt*8i$)BZU@kq-wupIKr(lo z`{+^uAnHhdzCUf*N}c~g`7Po5xPz=-6v!JIPE5Jzp$02UvU*pcblHii-LCz!dNnVE zco4+-?E}N*vOqg_@NPZBcY~<{*kr7>!D{-fG_TBr%VmrHzI^K)S*GJkPCNgognd>M zv7Jxy6Hng*BDmWDR^NlC$TYo8K`x$l>!}azLSvDdu9pXUFT|bl4_7Tq)CU+XM_2t4 zL&^L8q;ss)TRrY|Ue$n~&t`Q4f*fFTaew`0pnsam1cb_ln$KruUuq_l6@2WK`dX+O z&TVoT^qgml0r5KB`nq0iczKrS(Z=GYuKfzaIKb1}RI~viCtG2q1%%ZB5C0tX?}hd| z+0oW?ofoNNAmcQJ)}bRxg@XwF&Ca(e#1IM>OiDlVTJsdojBeGeJlXVj$~*f9yY*vE zTYQWc6m58qAvQb3tsbDu1!0ihxM3LJCDeH0JJ3vX3z%T+2#k1PHhhGNQ5;)#?iz1_U=%S(=8%fDsSU%N#@6b;jk|0rLj-@!euZm?lBBOv}gP5j<*yHj%xlP|=dow?`{1Zc8<4;9vak+KAxr za?ckGWWTaBk`!2dw6$5cX0oNFoL@QkocvKUhJ|;63cwuOZ*uE<6`h`;6o2x z2NABwqo6AK%I*@c%?^+l%-MRnCtT~lCW69Qf05)aoT zP!G5J{7Jhi=P73`637KffQv{O>T=l~cAD2IxkxNkBRdOE=4Y*`;#K$p5$PrH3sf6> zJV4jk@jxfy_N#hn+#Mw}N&2=P+8@UKkmT?Y&E=wyljxwj>I3aJN)T&Hj`#e|P&`NX zG%(Bem^DNjzw3Zb2EG5Fgx+8LNtY%Ya8mA_3i>5@9{8zWUarZB0_<4eH=2W!{k25z z>zfzS&ds`cn5oLJ5A+|6zv&Of&Wefa)S8r@zv+q=VQf?x9z{854B5AwVWP|T;kCNb z38(NsnseMrwrSXaqy95R`Hf2{`zQ>LU0>XYR6g=(5r&-oOAIhOP0@4M9f^xF2yRFZ zPhNsbQ|1H{u8LLd3$K4KRQyI|6`&^R)DMDc?W1>PoY!)Q;0~l;f<2{IHwd<|0aq%W z-A0WwdADQ(f++ZBYzFF6w|y%hL?*}`f*qzD@?rkUpuvTV!7VRFp|@3Wgy-S zi&~C@cob5L(jyT=g-XD)7NT5ZrE?dG>2DTO@?y$xa}STc_i5T>&$GEYRpF0-aBBL8 zp@jLn>S62aJy7P_WkPxxRdFICt5sau0U~0neZfJ(K)UYHi{it+Bc^NH8ox±ZkD zjhN1DrGA6V6AVM$XUwx|gR-~Zo08Cn=mnq70`A>(qc=KEEXw9yj+tQ*k;v%GqkdPL zjxD8|gh47UKd$={O%VpUN z?K<6W-biY>-z&F{YVDm+7h7`FFmc~BxgFIAUUQ|E(pTU}*I$UCVC>Ld36$*e8#b~g zv8k1`d_Zxh=L8Xa^%PMGNvr)|R{Qz*cO3UBHCK#fSC?vMe3OQ`K~H~-#@aln<)O4KY-B=Yq?Y`^yLg{fC)BS!j&laKfgl z%a5RgOb}XnN`c4`bAA@PC3zP-i*Y|Z4gAtbdyuCI^?jt1!wab& z7gaRI@V(GOK=bFio#s-HGT2HU+b``!mmMDrH5czlBCR;HY{G_!|Dy1z`P6_Aj;WT6o7NUW)L ztC!-luhgG4N0IGN4vP`6RsK}|jcW)gtrv5lczvEv-%MdFbCQ`4ynRzicqFsZC+;x< z#vVs?yW4;VnYm8o%H&^W<}!ow<~m~d8huu~byK;Vy?!mYK~GCOKaRJLX+;CRI#aSm zkEz3>_0fk4s907)=Uyha^|st@<5D(K2`ss2K?0B*Sl)w5yh9fnL8yJxw)w2cKo|pz z&vFZQE2mHPOZXdY(|T4UDP{6?&Kk@FH%&=Z+v#7k61(J)pl>QkA{WKkmb1luq6v9& z6U#fee0oeZkVkwF29q|tt4p)OEbft((3#eNfGthyM)hEHB1}4jV3h73PrW$J{1KtR z{MTZtIQ!vmLmb4#3XKaO&d7C?P(k5+a-7Uf;1O?yVQU~g3z_Bx7a@eoSY9PZ;F-uxqy}`9|j6G2a z3KWpZoRp#9>$dlsLAUUiw0ZTZ<~~f=mN}?>zi*FNW2D!A(3lKNbbn&LI@S}`XhNFg zgK|mnWxdn4SFEClb?2iWGSJj{de=wnen~3d)_GSwUYFt-VaNm|mxAwS9+?<+?qa%4 z)|!CkrA5@5qHvI|XnX##dAyx!jXdqDgJWLoS};OrMVyDn*J^3N0A|I`vat+89?w1HyRQ~{EvT=b`V({>(2TB?(9To&jDDX@JeOVZ>m z>zy0_$(eYdeOBQIskxp2mxXtrjUP~|eYd-di7^r8E1owX>#|yQoMiR7k>5niGfbpc z9@^WnVO0CED|LARQ)RYVbldOmt|_>|HHvY7D1G=$y z+slL>eJNYjErRT{nvlS~?ZNw{=0$Qj*GY%r|K#6me$x18&6@8?bAA`R*yL0!QT%&M z|MOU!5}4$IXNf{ig@#0Gp1aJ!Z7mBfNxm(pVa%hkrW+e%oFwwT3AYW|RZE*5v2T4V zO7MIHOO;OBZ?hHOiewue_pUSs*O-J4Hw)Fp^wlc~7g(^qN%(=pw?Y_;!Uy zo>Zx-eG1cXk2CCamVf*hho2S?SnXd4e2qo%qjMPiB`&?%}=`op?G&PrOB10)+@L=t{ z)t&rbs|nLw4)EbQ0kJLa$((X|Kdt?lj#=(`R!3JCThcna%L|kVdl^a1{LV1rEiR*E z&#BWGxb=ysSfjB@T^ddVrNrQvQOlFWgGs99+`2|qpwuC908^X&o%M#Q(nMImU>DHT zFa0oVZb*6pM2GLc>pHXl#^(3ANs&I@6s4EXiI$BLKjIzR7p5z?Q!K@N75Zd8tET;A z5fSKO+AL-jQGU41Z%1d@8;qz4%_nVAs@;}`*mVUtw;dBCD_GrOXKhZ_ACpg5KL~T} z)7+_fU#<$2(^(sI0uG_<2QvvcOJC9B5U3e~Bc!4BGCIGm@!aMW;>eq{=b=)UV5$k^ z)i+uKaxta}Aml~3O*{lPuhQ3Ev(B*wax|@6gy0U!mrl2)niJ#Ca=5`gJ@;oQ@7ZXg zrD??^XDz3=`Yx`>eLBpdcCXWHB9nh2EPfW&`Vd_ECh+w6j8}7R4^0AOG5q;YAU*zX zm5OyWG!PH8CTEX67k2R92I!DpLn|^eQ%@%VQfh0J~-WRr!|6o_e|#+ryMtu!Xn(H8x?D8--+w? 
z7gf+CihTHt^mAho#5T%Pp2JM+JoyY*!)pI_No!?1NCY;~k{)0*Ybc9ULT^gCKB5Bq zR5&ey+x?%49Cql{)Gl>vV%QeyG3MRSL8TMDOoZ>lH1uMsis^Mw{C7f&K^)+_aB6^t21En5PKIrUxOkOcxFUt;O0-Q?JnO==4cJ{uoYNmCAvuBxz#%_0pyqX+C<>tGgYx~7TO(SWpk<2stl}FzsD$l=x6U&1I)qeB+kL19 zk;<_X3bra`JuM6V1zob;T1brP0d@cN2>tI_vwlc3E1u#kUz0L{QL= z*Q~@_YD6pFUIkh~`ix18=2bwu_m=@d5e=A0*J_P#vipelK;F!?M5yz4*(W0u4!rUU z{WsU8TPNK+{vCPWPZSvQFdy?$ZnrFXjeIq1Xfq-S%p!COM#QD!b?j?3=|=@0$dV|xi~=x)EWMhoI08g+g9(LL$j!N`o0lLk|py>CQrS}%Gi?H-z=#EYJ_ZKm+o z{{XX45A_V!W4Bvu#_D$n28pY~% z$$GE$zQft&zxq?K1Y)|v(4IK+3y#*qrHW_c4eawOeI(p9PG*0Ux!sq#DgIuz{k|$O zHM|@8Y~^dzH&V$&j?x=NaR-2lz5lA(!9tw6@xJF}6N#GM)Cn!Ix`;Gy_DbOiOGqGq zpp^(}NAUYmwrhI9$ZGfgIVe?OPCn~Pg6*1d1((wfd!ytBTV5CnwLAH-Wzk$Hui?y| zPNo={^^wJD`^_H2LD$!3?_o<8m6fy^!ei+R?SuAMQ)NyvjJ4{n4QJBJ{uxU9QPaGP8UtqeTKm7iE^4w-uk zN$n*{s@Et>wbOf}I)oVIvmTj=hPXMZjmWLj?w>X^bUV+q_!677(8L*q*Pv5`%+6YM zY1%t4ut5%^vMC#Fa88Izh^|yVJSpuqonHJmUYBX7_wwF803TDLT`#+Iu+nj}s98XE zK9Lzv<-CK0NNp2jFX)2H2-kTL$V=SplWQCl(GTW`ual!G>D!ZEYM!kKzJJb$TZ#S{ zlq@ywCATsRGF?$(7t-x_v1#|ZbwacwF%v8bWBYPypO1G+?9=8(qO-|aXc##f);1pK z>BsUu>=8zz1kP^#u~Fkv#qfZYi=z=|(k>rs+*2_`0PHlcUAdOcD^2t8qvEk)Ys-6+ zBASUGi(f5!y~;tt4JPbXiN>>Hpu3u}hD}FQ-j~C(VbaxrzpU8t0ci|gF&0BL!=deb zoiiLy2QETrh`P)vz1enyvZZ?0}S3+TfTMoX*c_NB~EEZ6tIaSabHaV zZ1#E*oGZvKU)f4QwZVLBO}%~Sn<(5lS?H=Z6rg1U!Roh!hpzARqKFCAw*}v=V3wSm zd8O$+5T`*@=)GAKKd6nt+lN~ro# z=X@i%#nH?M>~EvdvU!V2uy+fPD8o-K*h%eQf{^wdS(is!w$J>70^{nTR>>#q{&n`- zxyoxw&T#VivA-=-zgOW9u!P*FMp8t@0&t<;pdoz4B}$2qzE-PqsXO@^X%tTL3?5ZlT-S2U8$=3Q z05?5A{v~B-0ObRgCjLiwQ;WlIJJE}hk>bB#%Kznqgd)rjX=%ms6m+S7M0OM9eEWKa zP5bF2{}El4e3!w@EZJ%;@cEwyBk&dKxINN(* zye*#2EwTRygT}rG4iqJ`gl7&H9gD?4bkdvh{(}M(p3k{I9D1`$LQrZ~}nDb!qPZ`z!x>3I3m0 z7etmFcy!{}`Ze!<$>lE5Fyq&GUEX%A$FW5R$oAu@kHk@w+iV&mE$ugelz8zOWo>i2 zE8@5O`Hs5gDlo|su2~wY{rM)v6M)Ttly||mhC*pvJ};gfmv@mEy8wENiW_XKEX2lV zmw-qSX#et!zpYl*?)hg@VMZ2F~-tbX1MBSHa-24H-qcqhAdh$)SA=Bbx5$uk z38B6H>%`Mim5inv%_I4RT1?i3Q9}OqFu%u_Hi1ttPv)lr_j-#gPKyCamqCN=q@WAk3?t)Xa> z!L0eG??;HARI1{)2NJ~L(7pCCcU*wxP;9xpJ{DDTuvv-I4!~L-^nQXbw9l)Zb0OH&)93%cfc@5BK?wN5s0W>cG5v@(9Iy-Zfmej0W%~6K!7t^z;Tp19j{k7=@NqLpE#QnEcDd-rEKh(LOR$UaU$5>vG7T zIfKw*J&Uwh>y={Uw*IzoJ|iZT&yBXk71LoIpg|?UZ=v}~&LeqGqRZPK6q_86^tw75 zmMf)H7b_2u#}8d&VLpO{9>+jCV4s(Szc}iK&5aZgSdini>}5#QwO`jus ztWjli%aAN!TqT`!)zp&bqq3&nAn$&C%n1E(!wjD6Cn-zB%9s5H^d>AZ5c5miD^+b8 zkLNT&7RQnF^BAwx1e{zxH(|zRqP>3 zM2?ZzUgRB6{cfM;!_CPub!~bfWA3-#Ovc-3<1P}N{lsxR?kGBQ@BhA65E458TEnPk zZ<59CiEFXlw7$mkg*)5DYO(1$wt&NW$!N?;arG#r{AS>0G+lz$s(Qv^Vc?t%T~x06 zT{bz)!-C=c#iEGGa=xmZcz)G`a>7so1%SHSkFw7vp>1#V`@|1wL@it{$a)_eK*pl= z8Hbxt4~=%FDD8i|>Zn(_#Q zqb+fg!BjVg(R;l#tG}QvtQgR!kkKOIaf)QWf)?4a6?9$fY&III*^k5to9B*8dh^=0 z(7K#6%3=EsMThYx7+dHqgHswww^}E>50Ds&>1w#?o^#o#QVPpYmicps`=vH?_kOZD zP@hllkoZfD-p0p5o#x0;Op!7>;xwBx;vW9<$n{OH&234^LIZU zwr9SOh$Yu&t4yKwwSi)c?aDm*G}c=&L)=9vEqLxLDGGQoM*V$M7vPjXUjR`5gn7Q z%l-&RP3rMR&mQ6nD;XGpE`c5b^Y%zGD^x35Q`Rib2EAo(F~b+!o`^en_%6B~Pu#WW zYLj*|M+(P1bz$w0rn5v{?-;V~;J?kP<4@Tl_r=|3sSUd#vsR@Ynoa4&hIV5u_2Fga z0m|GWb9$_R43o>AVCWe0mUI+=rU338X9Juh?L|HZDjn zzZ+#MqgAF&wm257AuE*)Iz}x*XSmlZY+0xcqmcS7Et@W2pef&!_W2<7UwSkdQdk5h^%GQNahvFoW7i#Ab@l9`3~rtm zvkI1mpH>BRtVm+QVCFDae%l%9+f1W!I*`)ont1!18^@7R)_&3?HTK+&UUNDlify)4 z9DvW;?0i501dql8&(Yz1@K{m~h~Hxl7g<*UwnSzaN17|}bfz?g8(b*D>$cYkzZ2R| zlIZvR_y>LEFxmakcGJ#WqFA@Q>}r2j#JT8Jg`6HC(=uy5Q}!8dXkR0l+3D9zvG%fS zP$~E<6T)5oxS<7*tl{SvK}88p<|tn?;{j2w$H3{F!0VsdLu0-6wpi9?eE%9ItNF@m z4qr{l@sm|)KfD7bLIUvS?OI$Ji1!;{L==w987}|&;-T~#`|?Hux5nRzYx|X zCUA>!@d$`hmv|&rlq#Hv+{GMSn#*2oE5K+vxD6<9#)i+{a`%mV?3zmPLwr1)W}hoQ z`TxYP+5`M*?BP}Oiwb|`SQ;wcwx$NXgMfYN8c 
z%b7ol6E+jp=uuy$*sLOxEa33lZJf#$_f=rIO@yBWKj}38%e}oxgVS?xgbYf_NQ{bke?-rm4{)B;RcF;rEE7$+G2c;@SE0A=o;yf z6zasw4ic}Y(8ci>{^mtuF4qZ~;K4ZEk~$JEw*37XA$XMF_H_@yGU@YFt|t1ldcY~v z+7pw&moN0i>$$7Na)SHC{M&d_`JD4juOKcv#gnHbZGTp4wY>MZhJ&VY3P( zghs^s6+M2@^>jMoazIn^ah(-q`fczh*+BHyz*%C*;QUf!c(6l(Vnv7HMdLwy`svcK zmVXmx#DO@|Hqq;OU}OuI*_hj`{vK{4>nzd5Aa(km3fDD&jr?^*Br7xgBJRuI<&_pT zKZy~bO;1xi};m$SKbY&O>R;$Hu9pV z?CA&P%!ug_fJin1X@61v!b$WvNiGuWxl)uxMbd$?>Eo<{xL*BKB}D)ISM)<{Gop}0 z?ZU7H8=MwyAR3m`u1G2Mt?2tFnQ|7so+r>{6ZM$z+u(Cd2f+AQ3mMCEv1&&#TZ4VN z`rVOxYQ2c(#rBXDQoXe{5BGS*`Bs!uS!el|&^|;~U4B8O{I=<*U}`lQmZ#jz3!|V5 zsy!fzp5oN!%+K*;K_Fg=5z%3LK*0fpSk!tjqm|#8tL@QMaYfwt`K%j#IA60)E4Dt} zVomy?7Z+*O(zvo|M)p%FIM3$M-Vx3p>{`-f=)61##WOrGv6AF5$ z7_~9@sebFGb1FF^Du4Sq0E=%%fVbxCy|e00p{ zv<;O-lHispzm8;s_kS8Lx@JG9QnpY@kwtSg^UiQ!SV9xH-RV52z@+>&40k*bY^ktC z!!tU0UTO0>AA7wKVibp#O~Bse@p8#HD`nChLLigfROt+pKE;9%JPUnrjR=bU_kFUA zZhyP#(%a&6}D{ys8wuKRJ^_@mlRu3g`D<~lt z)$z|yOv)oU)~cQ9(Ujb>NWB_=w$-if?lk0@>-=&!q~Wd5I3a&FDzT3Tt&g|KMk43c z{$wV&NV{}y%49YEU5FR8_!R3)M|68ZAZ81dlJ2nG_S`j^Nq(##o&J( z{C&m*#F1yewVE6oVn%keITyeU>kYlM`bo!(BV5UGqZq{<)=bkf3 z%(Ax9F>lT_D(4YLKwm}Q==iDWp(s)5Aw+y|glP=n+H83C0!XBR_E*u{X!6ssWcF^y zCpG}1vt)jc-`68QR0Me29>(Lc8m%3v_bDx`fm=r}giNOKX4|X?9B^-i7daJBpV_FI zB<;YM5W)VSaErW3XoqE{<=?;0vyJ^_S zkk`9s^v3HtDabD|YK*G_hE?78mxJDr3nrH`ahYVxd?Vz%e6p8J#&*kH$r7Dov!|9iPX3A-RMdcxw2QIpi6<*xf zm25fP%MeL5W*7f?ZWqs&8;8R)MO=E-L-_=@!s(>U;@))E72OG`NbSFHpbjSK2(;g3 zn7z!0$MiOM-O3d^KDVmvw*2k~RP1OHA0GYS*M%^2B7q!Od&$SR1fUm7%6oR4b{h|rWg)i{ zplQLUw|Ic9EB>Bs3Y9j|eYYyCyOOUd(?d( z4aNFjGSH3DYsCGx$G+MLq!J2LS}01ZI<9lZHRU2yyL`5`VJYZ4;vKR^fXhd>tyrkK0>>x zk!<+w7Qm53l)e#(VTKwWlLOjvNq?ZoiC2}l`C`dKk`uv_Nhf+nYk~!e`eHPqb;(x;W?%&{g;Xi#2%ffXJklr1o8r7v|#&v zcbg^?8%)(<*s2MO`H`W$7Yb1)%xn_xhAJqh7C^7_HaNc`cK{Kz2T$r@PGya$Z^t_) zaHZul42YX@hgFN9l9rAP%bEj8asR+{YKbMeVqdJroQ~X>KTD+xEL8yMf8Vm-Zwbi% zVU8#Kfc)RNyEV6C{(z1K|KNVpbSoDW{2$>~{{h}jHEDZjx0be+=(K%jF&UvzzVcuZ z5qDzeq)Q54;}_58CqCd1kKE%vXUdr=OAX^b17!NDtR&wH&Y84~usWXj0qysE@T0eS z-FB-KUm3~wD2mSpFxeR(=<|SFsZmcsdA9P&{{DJD(VP{3eNLu79pr?UVZtp#wDfDR zIzrifkxt^-)2=1)f-eutxLB1L?_iK+)H~&IXm;5sG(JkY5PaHcNfdCNmYD>9BPF6^ zSUVXtckC~CPv?C1=lMAst$R3yI?4>eU~FAQgG6Rp;XMbdIHFN(QAXd&XPZ*-& z87XX%@c}2E7>Fme%4N{!t@@ho{!%VBK5sr9mJ)%xN5%g}q#{c&xHsdg6KLvBEmu$6 zD&4qE><+nwb3cnAT$En?38?^4g1tQ!V8)ZqUxX3?1DxnLh%gk<+K?P?n?cO#0wkzT zLz|rgm>;?j;UCj*72ZM_!lcDR|u)5RN zU+h$oM1qs?TgqBGp$Ji;qWPn#W-34Gj5j7pvcz?aCDMgoW;W)d37m>(>Q^wf0s+QN zA2$oN@w{(u-_EiDnDjt&v&99L3}5Oxuate}&(r{J)R;j%G*N0nE?5j-<#$TaWvafx z-jLiO#d1K%pS}beBPMyzi+z)70CICm?5jH4cZ`G^5NSdf-~nmE1^q5RtmP%XLKERW zESfLZZ0{^#LKek}X#4_;oYYup4`I{`9_a9Zy&TQx`0|v*NjJfL%inTSeDIMw>FGnU zmxl2you?=+qpE52pg@@aJT9#1tyDUDTS_NJ|o~j%+BE z?`(TT*R_H6{liRQx=0t)DF)jw`Nb`-F9$P4#=G=>^Y2yoNHWr}a38dC6?>yMQ7AFO z88Jp2esW``Tg>)-Z%eo$UUoTRyZu^DjIRzGCcrT5mG=>0hnNF}Z}99d;+AaKiBmG6 zo-M2GmML?Ep-2$9+#~a*8*g%PS_22dQxI0FxAXle^Ad52$%LUR?Z7w^vDUOe;jNOi z-ian1XsMpFbxp1Jx&ZC4zdm>)$Z5s{8zht&ul(P^sJjqUyrU7B-+$36b{O|5=2h61 zi1p5^@8KI=c_GU=;-s-dKnS+bgBsJ{%lLDPZUrtPe#n{H{@mr}6&}-{gi`s*&x#?E z&y_z3RW@rP=JcH21=ARMGtR_KNa`HT&VP=_v2FrX*bH0LEhYyiQfCl>kEW!s>7g-|L`Jw{7K;b;LleC)q(;! 
zZA{ZI3$P;dW@-tGe?qZxZu@~^YQ2zC7>b<@6@uci_$1ukwe+_2Vc#GPm4Iu_tMiJD z>*(%4_3VA1-@E<|AsBAYyNa<C({wBC6FxCky3S>gF-j&YQp$vsYr|Y}Te$$+3!_{g^)=W_@U?RlDLsmP zm-QaZJCNNzDqu>SrG=g~obi5rcY;Mb`HI!o+^MC2&YvV2VVSaxPh_IfEQVTGdNN_{ zcM+)b|Frj>K~XL3zc!nUL`8yRB#VGblq?w}O3pdw9EJe_0g)^qAW4$soU?#}WXU-* z40*@{4B=h4ANRBO`May$Q|HV1aHy#%ie7ZMSFc{(cVGRxwMemPHcq~wE_q-o@9J>( z?;H1<8->z0m3hLVsd=TaV@hAOITN~)hBx+ujsQK;ryBHr*84P7GmLjIF+&t5#<8@G z&v4uZQE`^3=dzwoUle30ZI*BPrfW`FVjowg9zCs~SE5?cJt4%Q5}wLWd)M8YE;By_H>d6b&(&{ z9i1#jm9d49lHW6-`{f6o?r>uEhe6c`cV1+V0$xKs5eflDebaTg$5EjAW?!m+VnysNi*yp@>v%PK7*j{tDV3 z-g)vLH#9?!NHC&-UHh2VDM%V`r9r6S&vDhoX(N2Bnx?<)wvOVZirY*DF;PkRMLgi{ z=pViKSXz!#kndUqcW1jKeV4x8>i_BLfRG7s0z^uWfG=`5Zg5$Y%v5uRzgZ@oNgZWs z+lvpb`m8kGKneOd$ebFg=HFO|Lrzv=m{rmY5HTf&jWU2pM!z?o$|u*xVB1oCSFsy? zSA|0@g)^gz!j`DnoEJ37c%hMUWYJ`2EX(J{AP`FEp>R; zQh$ExKvRsM(6~&^Wt9NQ0vigcViT-$3F$#l#^yofzXSta2ht{t7xvlrE&C5V6+W3R z=RkHET(|!42|0qJ1{QS34lN}cL30i4rp|NG@C{bpaHOqi#}RWT5Zr?=T+j7ig6J#m?~tu zKIcA^xizS5;C}aWb2~l4L7&ioqiCYoUHYyqHGfw$hfw@dUUcp>g|0VL#pV&pTE@cDuLLwOF@Z8m7uAv4md0;CWZj;u8z62V9p5N%pJGfAT~|o6kVKFyf*=W~j*9{#e;q z^3zutV`bCY@90Z0VxBNf3dyD4;AxB!^dEKI6aQEY#r+DY_qv27esAet zphmz~085boXE;h)3|>t3to@P}DRF8{P9?nVr|D$N&WM^a*+Tt!e+|;{Ykf4Th4 zM*o-D`_p4$JC#if90C7xQ)OJ>0bK?%hoVb^(#XuF;{Pn+@tc7T`` z{cps9AjL(ezA7pz&&)55Z*0>K20Ar&jm0DQ0AS>+8RwKb{XU>i*X!7JweB6dDss(y zyPIiAnob{ZgAwpXdC+jv)o_sW5{P762`JSipIw-p1B6Z)7nZSM+ey`s*}#muu^|)| zm{6qIt*v*x0{yuQ`IZBjfS5(>EyN-h&%zWZ(ale9$nqkMJzDJIeymJB#dT_}S@F&Nzpf216$vsJ z?oAUOp2*TJX@cNA7=0hSg&#%GgAw@g@DILAZ#oftz*PPSVFNo`UoHZ-865yL6Y5Kj zAw#MFG@Z8j--y}iZ?_lgND~8Ec1ASiCb$&;zqIVG<$BKlwf07#xW=QT@%;WgXBXAdT<> zW8ge#)a@FJVV?NV<77N?>iftTmpUQXt7Ge+&rL`$)h8yzEV>^jt3}0&ikkkP2kYi+RWPI z_bzwawIA0lBTbh0-lgWY(-p3XSfjd>E`<~M41oIHw%G(8^3W*&fB^BFl*q0e7Weeu z(}C~`blU<}1Nd&)YV$q~?JAQ7iaB_~V;@j!GtZ`bvH*;Y5PnK2kPFUZKiv9$*JW*q z_wXQeJ2KR?Eg-1hyUfGslUp+OIfv{|(K~AY3t2z)pp{AnSYI4bZ@14=Y4;7Q&G!ts z00n_8J_NW#Z$l@o0dkvnB4V&!D9gCP#&rS0E3bt(ID$mxFCxc%i}yaxRj?;k2d!Xf z(5E`nzY{qd)};J9=BsIe_55yIc5@#U%r0OlC>Z@$OSyqmc=ugul@IwX%iwi2Ef1F* zAkYkdo`WVKtDWcY31N|tia`8pjbc59{Vs`%m_evc4YAjG@I#)~gE9rK$HPsi9F*nm z__TrDN`U5_d_1X3yBk|N6=U4*^F z3r6+4hmTObaav;ElC*^H`9{OM%TJMOtrv$=yK4-*2N$T%@zWU2qGCDP-@LEJOyzSH zd-hB7&T}=AmyDo0ijsF0*mRc#;2HXwN7dYJ)Pa@QF2@`2_ZNc#xnd2KE=L2rFQp>4 zdk!JyH`is6m7s05y07mL*EkQVk8A>f4Nm2e&kD9HOCtB`^^jxL00EkZF~aER?QV~C zHPKdsmtB7d(9lV@*_=Mb!OMPNFd6b$S1z%PhExnEK>E%x{}{H0At0q`1-=Vy*?O2N zJ@*#hT|L7OW$hPwv-NORM#?vTTbabfYA(jO(+m)cqV9=@=I|RC*`ED-}BikO=7pYfy9&Fa5hiKW0^L~h3 z0NHRKosh8P%F&8_cz9vEI{rOL;dbkqMKAx;RzkAdMTr&u9uQ_zChv4N;>97ruf_Ez zKa-t)l{=<;?dsa1DZr=P*dKhEn1e;nY--YlfF(j+F2toiuQWV-Ws}d_)o-mz5iN;x zce;4O)At83*=?r3yjevmz6u{xp32nM-JB_ii^$*`zW*%GKingrE5`mOf`b$MM+dEM zp2NJni-5Zo6TZ;dC&7cW)o11YhO+*P;PmJKgTQ|hIK)1JpX@rUw4_0-0PXWX2%I7E ztscNqSP`K`8fDIbrK+YhK9+~jJ{ZMOooR;uz|6_Bazn$O0-;Aw{-=~2Y1BWYnZ=z8_J$?%G2E&$=^Oi#oED!s{YioKU-b_l#x z+>oCHj?3A;alvAFMHo7~RXmda4__7`Agmf6n`eO#-f?(V!kg-Jw;BK?Ovsz2uh()-mzzt9 z%3%zi^Wek+I01UvANeT5KfXHgZm2rE(04hNKRy=TC!TJ0PXT;Nuh4C>i7r-(OuLhp z0C<2aOF8?615a7V3gML+4a~klyut>}ShuSc{9!F{ zKI=d_jfqfdoR~xtJoJ&zEIN_{^Erw?}K(Wv@hWEcHJ-e3MbCNs1=^4}3!>JO)VI zCatp6SY|4OM!AS#WBRVa=Q!0P-)`onlHG4cU5IeB8amjxTPCBwTkzV5kAf_7e?+X4 z(7=epWB1{=cf?QqbH}PU)eGZ4Yd-(+ybQELao>oP<0*U|d#L6~dm@`I`|U|@K(OQa z6lA@bawU#Q(?L5|&b*F2E!A==zlFze@BC4Ym))oHt@C7^7B`xGRBXwRHmo`44r2Q{z@8R;JVN zEO#JNPp2V~jj~0=`p~SPWRwy|Ifx4tJ^nqUwRZmXP75H`na%m=Jv=2kAD67Ye=_FtUa@sa^TSSXuW!FDYl}q-Czf3_5KKj?s>W}g|7KYuueNv_0w?eZd^2^6)}>`H&`g$Tm66&RZ2~6ga&DdDt>wk>o)^CZw=8cF?Hn*p01 z0I@q5xfF%!Ra5%z@db_>GsxXf#GTm){ts#sTdySaxk~sl#%|`gB7*Gc93_MSk>pw| 
z4HJIV<{-3NKgbGD9f$9q2>v{MlX*|{7idF&3)+ZS9LFQR(z-K#`F^)ft8-zwb5YjW z9n}T9cZ+A2Kk2Y^W>hIfHOckvg&*jI&%gCsRTs#qaO)z&;au%Q3%vBp>=QNZl3mNG zKNyu#X#j21aQ6ojX?!HB--JP3ZrS2pcLO?IS9ZL+4D>Jjp;-aC$QkO4Tq(n5w6-aAOj3r-bO={c)buNtcb2#Ge9W$E4eu zWzRx3zS+EfLwnbq?laoCtWYR=5s6YzmxB4Sz)|EMbI5wNCc!EXEr)PQc4{Y`Um_=p z)6jl6@`@6@NZOq{9jRas73Z5YA`411xHbfNB6GN&+&Etb$(&%b^}hFGp_Maxc~=bE zT2kfRL){O=)4iqlRpi3&O;^>(EGO0)niW+uZ;`x`#iV_=;=cgBh-#SZpKECJyxwam zTXmj?`L->d55f_t4V;$3(9|LCF^z^VDL%zT2XX;?30fJFLUYXcVD^Gcc4j${lWgQx zW}|7j@C1x8lC0qZ%D2c|BB*zvr?#r!`+mk!%UyGba)pdNzH6i0R0iRvKDxR+=wo-U*(C!5|kun2zCLWbC zv|FYkz08m_wdMua=WDi#4|CbHtz_oLEIJC1BRB?KPv?O#9+s#&isicbbe|C5@Y7j? zi?g__>Kfhd+Utu}8V8|=?p*X97qmj_LyUMGy_;Z5$~V2N?|1W#?+K@`NP!C{>Cgf? ziN3qFe`Qh4`}lA((y@L}(+LmWF^cdz$8V0^P+(^E;lv6lLfsyVqxa$*ZWMIT%9E@9 z6yU>)-u|d98==HTyN!=JcV7mJ} z$o&2*G`8_aulCmhTRr6!c<+h|GAQk>LCJ@}CcA{YUI=^t4F|;BQA{Q3o=yg5cV_3! zt3B%N+Q(`D$9el}tU>#w?g)oLWKwLGL*=4~aXBz}1s;d-MKh`W?{) z#sy3bnPyu;Qax12y{3c)_xtm=kptV6p*FH%Zh6=acP9ez+q;_AUzLqCCSlTqrt*}He$aMza@EhDZ$ zbqqg=bQfG_MeSDzoYC?ZL}+{o_=gWBI~$cVRl$uE zV2vUSGp|*{-qr1^!o@Eygz=~5ypJv|aH-XXb9icO3`~dyMa~}V@a|J1W6}_ai$1?M zeqhw{lJ~9w;1pwG1r0u5F1mDao_t}Y|4!{L`P|!k37m>bR1qMv4b$ru*~JEK6ME)$ zxFk0fhGYw6!sPx-_%*506BW)W+rXhBJpz@uI*CVQ16ja2fh}YGWeas@P?%hRt6|-q z-_$YWWJ{7Cb(`R=E{!T0A<_NqnU*{RN}*$PO9Fri8dO zmu#eqoXs=x6Bm`DZ=e3SD$$tt;aN@4^xX@wn&tAPugT&->lNoklr)qgdsP~0ls3_2 zT3Nk+=kg#yfP+Q)8+^NU=!Yt+#txUiFN9h(S4%R%0+(U3crZQ)83f-E@}k3i3DO%9 zAwttKYqp#2&Ya)_KhWKB&=ib0qQec1Y}g*!p15dsr{woIWeYDP;Bxa#P0!jL^fCiUr^)JG7MpJE#ajQ}y^g}ijGrd=A-H50=@OB11lON7 zsvdgb1FlaYVHqmado@bs6xGE=HZ>eRD(| zQJ|@69_lWPrUdOgU7*vUWECopaxE}!bTR{|cRjjOSC|(gMS?^Av;#)g++{$)IT7Uw z^F}2!#j3>p1N9+RJd0|V!uRKWQ2sHG@!a*sSgMPtl*)L%ZTXwD>rOOmahAAs3adG0 zzpnPX9v|IK!k98DWmmg^<@|Cti+*E#TSFIavWA}+(KuY4T2=XJP1f`WN0-e+w*@63 z;m2wiV^{GUy4l)Xf5^uW%WV&NEQ+5{QQZfNjIYIdHlgnb=+t6r9FYff5`Xueg3Iqy z5AR*`tuEZ=Q7*x{Jha-F{ElTevLwXSVF}z zBpr4_;#d8BD8jSM_Cda0MOKPr{oBQo9FVJu*f<)yw);NXQ^=LOZb@fhs=GT}6FbPK zEI+G2;{$e)#zE(cUd_Ul(C^GW@I=GX{T6z6UtFTFTDB2WNgdAtbgi86N6m(R*Fhj8`j#C?cP zv{Y2m#UV1yRag4gL5%^Rh-i`XvEwAU!eac5LXydH64jiCGJW1w>wH)9wa%Q$wjCr_ zY>Q5-3KOTL;zx72kouhzZ`3i-Y>9;}aP|Y$lk}2hCbBG-b932}u9L0tgE(mx?YfZq ze6f5k;yZ80ic}{JcWQ9SXU?z)=jk;#E>Kr4D6AJN;$T7s+{KzJ*~fDIf*euu9-osp z!3NdCTc!o$6(g(GDligAWV*2FXKFl8|Dp8E&PQjOuy+@J@A)CqBlIpJ^$T7vI$dot zB+0TUd=(27M_M!;Z-Tjxk(}AN8n1moHfy8TRk{U)4->qEJI6rEEaY6V$z&NP-?UrY zELtxx!bunq$s4WtMa`Rx_RUq}lzz!K1foa*)tGbJjKccOZpj(_ zY6WIK4L#3^;#F5$(X6`x5k_R4vhfTJPfI_{ltCm^NA)gEv4wM&Mxg9Cjx4 zKDmVr zAmQYzV3#YE1Moz;b?Ghg(25DP$0yTS)n=N830(Je2*2qq^0AOK`LtlbCmq4-$^?8A z0d%#NCQpvJNXd?d-!o)xldm-5hbFo}(`+Tf>Csj@CYgNBKUgTtr$}Itm?3_+H1sOP zkvo(#@-C+CaH)B}Vl56tdt`H>j=aCbpTgDJj!*5d_Ff)C{CylH%}$r9lCZJ<-sYAr zX|+y!Q4e9)`Se?Ex`R#>j5BSa1Wy%7@fnB4iZIyjefyNfr%^Y9E^J(KwXNrq1Er8$ zlYS8Dq{Z`xFya|MBiF%H6%ToDIA#-z_pUOpYo;*LdVd;}wTPG3ejT|Qb_UAxeOwaZ zsoxul-&WFNrO`M?%BEX$TEwlJXkmoogBJ`-(NE;QZxC>F>gisM;p0|^=OA_D_0EO( z)Tca}y0Bm&@M7f?z9KH2veT>ft~@I0JtmsN?(qG_@5ywUg81&=g~9CSM>B%TzrG32mvM_zJZ_$) zOd}E0q!T&ge(3>zLAkWRq{d~#Fku-Rl^b$eigAdrxkrKZ9*fo z@q>{$aUHyKQ7dxoTzY|0VhU{%cX#KGnge+z{;)23s%hRXjeVt5{$dep@#9J3jUr!H zZlU!y@>{aVtgJ!Qvij7C3&D44fiMDA&62|arQOj+dxTk&qOWyvM*jAJvG~sQo{(?4 z#O|l4yS-_k8!-Ih3fhQA(Ehe(zwtAV-_dwt&qyI!^fruixoxagx*ofzGG7#rm%_uJ zxTUt96sdgJbsR$wWnGjf09d@R)a%*p2}&CYYeI{E4`+mN}92AP9XG2)Y=Emfgb_qY>A zf|Y+?#a>@+F>N`x8*jGWIa}#1<)#o2>0vTmu6nZdn&&1o=k2qsTQ5! 
z?Kq_5o%F+gJee!|ayh>997o{&R(h!Gz#PUOsL$$NlNl$3D-*`P@wZgdmlI)WE4$or zVW8SxD9aviUl~3-=+4mdxVh4J{1^iy2oJZU)*oWz{UgJoP|~l0sk6$uVd(?tUgy}mp(4S*K%iVFW1l{w;sa3leCF@M>()Ds1QU6<_A8+HI z2-N$Z)wY0cDtYzF60{_-ZV2zn>p^skOwz&`o8J=2t3`{R49Z8p|KO?;EYv)bP8FRo z9u@mx(!*)jM_BdraBNXlN96nHp&oiP<*ug=wZM#X$0VTy&gA#>KDj%k8CncAlj+wy z_YQ<9w<)GSw4g;D!_W7_Ub>Z54D$YILD`bQeWeBbF(i@&%sfqq6r6!m<4O)e;qc_Z zNw_mAr!lWIEf5KL`lTptU91anA$x8{JcH1dX0L3DisYnfz0-08zhJi{O)`=YA*Hm> zu{@4iLiSc1q;FR#(KR!oFI2T|FqgO=JwM&!BV%g?`L+BDBw!^ddiy2QjC4}*HRb3n zCe3RkV0s=?gNn)62;~$C3e(<(w2%M!*PluM{c=*KfwoJ2_%nx}+5G)0U;`PIFnhKf z?N=FpPy7S511KQGxcu#HY5&dg*KfmUn2b_&#@N42#C(HAAdk~&boci;XJHe_M>up6 z|EFpHES3P1kS2*Vr2XOVbCw_hQs+b1epe@8(ZVSD-OW9u-y7@<=w@BeuLl3f@mJ%`KtQdguNjbj{o$YOJ^;E| z#86nv@61Z@{S>HG)zwQRq~BI+<+ht;5Q1ob*QA(TpnF>&5|NO8Tdl85F$FTHQ4 z;IKLw5f z>p{zCN1_^QPqzKb4S7nV81m1zK74TQxyR`= zLzcs2xaVP?>TdsvhgZ)c@BwJsp~QL@g!T`U@ayquyX_46jtjAHK0!s}Z2 zO)Dcw=Y}$l*gUJNG03@G-f-R*&-4 z{*-BS71b4W4OtBb_PpbIHP&gz+ssS>E>KZo&Lho#SdYTtNj+-2R}wX_&0wo zOF3GGet;*LUb+ZE$@vMewWVn01H)0l$T*(nz;gHNKbrjv+!;jcJ=m8S; zvu*x{R*5CEel?19yhn9qjn`7FV29@^M;ADVjUdmG#Z$%B_eGH?)b-uMzHob0$Y~|| zjhBpe!wTp2aBigi{Gsi~TFOdJ z)?ZIr04jD)u5gYjtKJ${5lFJr%(7u?n*;oMFJDiVHsR@z&@xI`p(%HLZ5X)5q=C)L zv9+e9CUyX^@v<)ty7A-6bbND+v;1XI%2lfkpYTBrq5XOv;SS`0V@5XbekJ38&PN}S zQ`p3}4{QcTfTB`OxZdi4m?l)!=m4H3VE_F)$<Q3u*HWQQ8gUAo;~_q! z%rEM4ZSC3n9{HFq#-aV|A(T3zmP4m<3XO1JYpm3qDb+~~({?5znFrj(dU{Thp$m~q z*jfL~I}J%3uX5k^D_>`Ggd+xgOl#Q;D#xURI7j3X9$Qnh>sfV%h&TYlggKyCRvUu5 zr3;Pn)E-Pt_Icy+{SDgfuoSSvR=x?v6^@yzhNPX2HrAZBUwVhWwwmf83_{FcF{PW~ zI@b4e35h(L7AC7K&U&{5Zb83*yviM{ncy`vHDtf}db&4l>c4k2igB;Td83L$JNDr> zy}q->iNnq(wln)uuW~p0RrFD&Y&Q_2SSXKw&*tBm~QD}jHLE4Dfm9Ad-0gQ!_N z_etgPYg-q%@ltTKwH>z^J4BS-Sf}B?Db=azz#t@PhSw4yJe`|r)IsF&S((q%k4`Wk zZI795@9CJC0eIKFDR2AA<3;eHNB1txlTzIUUaZ*TbeQ_ohP{}2p?CYJ4>gps8m&P2$1I5c3}b;Va)3!juW zcj1RkS05}XC;OfsA?$(3xrYG~^IE&e?_Qj*Z*%O7TNCa^aK6;awqV1}&0(N!lLa3hATqs+0-^Ygar$Imrg(NGviA*P26he?l zGf;_c6~9|k%=R>C{MeS;xa}n86*KAi?5Tb2G_HI4&4lgs-iva_-J8~vy6y3{nFVi9 zEt{`@YZ1nE9~uouZDZ*|BAqlI_5P&=UIc+A&H#hXqJNI zXf#DSbbcU!FsXRYOk=ygEY+?dHO>cbks} zYvPpndm@(??Q;i(hCbAPdf5K8_~_x9+|MWLvGT@P*5?c>^PVhx@OHV*H`|Gpj?YRD zJinLrW*p6(B~KK%bCO>xXZ-Mh9BfYQ{^8$LUbJ6`rYjyty?yhEyY<9F_h33N6AT=l zgC=r+^?74^rLj3ul|kt~mZ^@0#*v+%HJddbdh*vZ^AL5V1Yc4hvF-YzuS-y%0~qz( zL*Jg{u<7#72I&i&fw|P4f7ho}N@Mq$bGMy*<0E3DpSK4tHn_6ATCXW;J7~*OBwh8# zxEV;fDl9R$DyeBGDzbMXEJe-L&6~(np${i%cC50HuZZPPILcNF1TTCP>Vz7N*+;s0 zn;g@#dUPfG9Wf+Q^_P+EWE^@HA1O3&)K!Iuign$AdCc2!A00A`+n$MLZ7A{|NECS( z6X`bEX{+^!1uNEMNrq3Y6^!}{OP<)NP1kHk=fjOr{_I%VlA@hf>B2u)D>}hK;FqpB zwzK^?`V{?PLiI^is-4aN@6tSF#Gt!i93RZ3-vo^Y$`4h)A#|28Nd8(zaOZSX1iI zA*k?HvtaAnMD5F0ICKmA`cBQhXotFSx;i@J;kFm*A9EZBi*tTDuyRckJSn9XDg~1syXcLzq%}bKap>ZZOAbu zlh~>+FjL3cc4v`c-KF>_(p#FxLMK4HiQMc1)caTnaiDK0gr> zx|l9f?LrV+4)kBHZLoY`)nw)`iUWD&dDUs-Y>m`{DCK<0j;9Q^Sc97ag zYqPJrqt0WYp7~aqE&4*A=(x>widY9X!g7D+IF)+ znjnB8qbXLOdcu$4xE66#SN*Gg$Sqml6!Fv2OC=#6$v;l7;=@n+fMbl6_bfA36Hg51 zFEpnQZhB$=q@!M%_UAtVqSI1#c`E_7U@0-%+dlG}5FyX1~ecVJ% z74wA@N3HfiGZ6 z>pLq{hO5)42;Ji@NJ9<_l5SqwGUT{ci{szcIz9kaibA zgK>3Mz6+t-c0Pi#E-}42V4e5ROEZ|mc*?CK=}pM%z?Z~}C_-Uzc$dgG3-i;>X7yF- zTfoP~hWPh``YZfq%TjnSLL)8&(vJq>LXAw+zQ{Mz(zZ*Sb2V*Bkpml#^B{lTBK@k* zE!SP-wda$DCEZuDJ?qgfe@H1*AD&*G^Z0d0h+w@m=Iz&poRf5G0msqoj;9hY?XUk_ z?a~Nfs=0NVGBzJ(m?d6pjfL@m54D<(E72zkS6lpM8Qe>qIL{R7l=PxIsXQ00EfXKD zS_G$)HyVDkAvvcpT`-M4dC~I?d$I22mYzj8iSJTK$K_t=4^-I}ZoTy~Az8XRZb<5l z>ceU(*A!^9EPsuw7Wb9AKaXvRZ3etVR zqfpuBzY&1&T~cIiJNF5$vG)U8A?(@fOlrv7R|-c9NHPP%Y$Obh<{$MLE}>=!=whhO zb10=}?{;i)UpbjYKDPGkJJHw+lq+4}9TG99d1-@K%5tc-oBihFW^JFHLFKU6n4COb 
zU)-1_7m4CcK(06YHJ$oA=>+B-eFZ?Lv%6AJ>^EMrBf{?6=Zh8Bz@p4_KEpm)lSVu+ z?D^wR5*6XVPt~4PPaPLq=YjZ##+(M;#ZcB~WWmVvJmeKl$S13i_5G|&o);`GGn;!_-59**{~b;E1Jt&~r6J|2cZg04EbgSH zs=@BS61=>xw()>iaOC)>*Q(+rBV(xMh{0WA$F$P9^Mu{Z<)P`AKZL+|+e@Uqq4Nc{ zuPz>p6Jp@`Pf%^n&qvd%;7Jofn)W6fot#KWz7(4aU5JSB};JI znH9xJr^`fz2XL|c>wKzcwpsUzkjaJn{2z#2)&%(=QiCWJ>A<{V`bB-LIjT#&ZwP|W zoBJb!%l3r5$4VHJgs+yp=H&do%ncq1rATj!6F0V-)U8C))HSlD+1JeHjDJ4cSVTXZ z5*KUjDpavauppbV-pDrklly5|->%c_Fco{R>i=sC^#H9C+Q|N?8c^O4=h=<>m2Rz_ zix8TpcILyp;E|~u!9@Eu`wBz$gqpi zZ&NEj0M4u86JHnZ-&)ix38`H|TKAJ^01nFfm)~DP6p+23+sa?0*C)~(RdTa`JxBT5 zMt;SPf28%y82_Vb8!Hxwiwg61R{;-u2~uY&=dNmZ7~MaN@SnnCv;dD;*vb0vua){g qa`|2j_{9F-KmYwk{a4$~UkWtBZ#o>rc z2j6zZ5p`5cjiBDU#@TBxPzl{qc?JvLyfSP<>f{1 z-Ka`jX$`{0hNZv@}%t&_8;^Vt;L;d~tq7!|quD(eXqt~6cM)F&&P?%FSE z(4UN-J-hEdF}hhFfZO}_3B<;VW98FqNafvhhUdOTk_@{L(Vn2sR>+JCher2Zc*Lbx8E17r%5OpJ*0ajW1;bjW4~b(Jqo5jiUwP& zrP|JN*ycr4*`)Oy-HaboB)JJ*VQh(GjL05{`e1FE>V3=+of@tHAO$=TO=R)c9YlXUw^RT77r=!nr)O!bnosyaYhEDE>CSI z-;nANsaQK%Bl_SjP-Vf`{My%a-d!XiG2m5ggcav2zjut;pdI$%D!1=UP@B)(YqM34 z6ZurEF%kW|zwqYiNY933ZdBmbetURmr!DLKVW6)sYdU^S^C$0JDG{cO!RId2afA0< z``3m`IZF{Z9~#^F%DZR8M!m^28T$d0PjLjhH1vfFc5mVc?>}sg6y%sosEuUt>I|3nA1FW*)ls;@fQmltvieyIMAg^ftXThd!P#-8CzmyWQ8=zZq%} zD1D=-z}F6Ds~~xVC*SsZh+r^S(3Qv-Ur2_#o`mq*Qy-$vZ~jk-WJqZ~Noy-y8wovl ztkwO{Lb@zoi2TM|nXejn$+wBI`pq%oxwnKL@2(Rc z5WO(xRS6R?$DhQr`KBU_6C3=Xy~mXqO!DA+#)4Ztp(LKrcc)(h^$%!*&Au<)kj^EQ zdrrk8`^P($dodl472n-3cuUS;tw5|9S524M&1r>0DJN;+`UhKhEKj?z<>}LeKkhJ- zRo$$*s}&;h2GHSQ?)-|7$MQqD{!LwKhbYhQl@&abR*qb?H;lq1f+gNIzeD`EV+mh! z6_x`(9>iUXi2jkcn7Smmc(m})wY1)Omel!CTvzde>Mqx@xj0q!d+zVZ@3d{%U8seK zUD_L(1e~D)Z4spMq@*`*Sifn1DtqpT16H zl;>q^h^JR5%&p0F)0Eht-@x4{O-@dZP5!L?h1|^ky>`PGPhp>Cm?mw{o)uXoi|<_n zmHFqA3eHS$rq(=-CxqnQN{^T_DkC+O8>D@oqv=#r1VJ{6?sz5rt^3LnOOquyXSl_<9Fd&ph$g--()a_ibtl zX{r;~YtabIKD~y1%=T_mFblbUiAk32AKvubka?e zA4W{7*2PLmy|OwnMUnPj?8`Py$D@lr+w%+93d98zgf{3>G$Ep?GU&L8}Aa} z7YGn|^AN(%Wfx=5X5Z+LZ|@8fFA(`8H|{pR=->nsEk6t`leQfh-B>xebMm3Ku{OI_ zA7Thm_SfvvV9_j)E|6LK$fXGSn2`%HsWb6bQCD^enZMaA+U$Sw92g!Lb*^@%c1DCR z8)g@FNT^B(B~BqW2uFu|hLglRdExvb?S;ULy7#X;Z!hsLNpuRSnv;noktZo7QRP<@ zHnYM#;s7Y|ZSgMOP+x)llYwpB!zdK7*XuPKZ;MLAmxrw#Ir zcIuj3m|Rv|bj-&s4m+ni(k)E@z?pE1CCjSm{HfKN(n3zCS2==OtY{7oa_EKdGTWx! 
zmODB;Vz}dXhv*K3`h=4I#SKe_rAmg0;r7=xOO#!4YDa2QYF(M$BB!lg(Qjr(8p=f zkY%Z5u}c^=lX9DtTaBdrXj?ti&;0Xf>Y+y)JVi6PlB6PTGHSp5pXr$5TeWY#eSK#@ z>CUMukV-rAC?YO8N{m#fWyt?$?A!L}ZHoJxn?{cH!Y;chOhw8gAnILQ06AcFhw+h0_^n^VdUrGCrkL4;qg+nv}ru^x_(guzYCylNL!!rQMX6$Y=Q{$ zO;YOa&`Jth0o&1`R~wC!3a4Q{PrC#Pc-Lf|wRa-L8^mSi53QN|1UdXukx3VQZvAe9 zdq?-D*(b{WFzzunXufHfQeOnD+1dO)-tG_pdt|6=m{EI#!s`ni3H9Fk1gjypCof}t zbkc^3Xr$kufBEvlZ|zV=$|7+2VzQ>qroD{VEV|)`o-&7Wuacz8vSZUp(zyDH6mTZF zy3nEOY_-|Q)ri38q_)vXLYqx5-AuIFsjapXae(+S14q1rzs~EgEwN_-6!@%3`&853fD^yzBo>nw)nU2L`uv`Nchj8CiDQ>DcZo3w2S>rd-!JO!?5&9U_DR- zleH5!4M*Qv0!`x=U01pjAo4_5*!9V_hJ2~46XMljd&bs+=9D$nASl8H@zWnbZ;egRy8)ek#j=O}j zgi~FUvq|GXBd}EHOC5x}5~+6a={!ViRt$>FhXaA!YxB!z0wfhAQMA319R9iIUsg$$ zq#hD(Umv{VM{FAO==}Jc)OLL(se~mv;#Rh1R;t9PpDt$h1f6y_J0XLcqUDc3)&LO^ z!-Im^Y={|UvnU#bF@tqkbcUNnn+Ew;?Uwe=gdrW_HE|I!QhwRHfo`9U!+IlRX(?&N zq~wD91k*5-7Y(P;yP(u^Tr=8$zT+d+Zi=r3^0)+^!mLEWLf3I~I`E{MiEscv`(q@W zhWal;0eBaYwyVDYRo54~AL4wU#l7*u@;=+fXRf)M(l?6a>hUK#Uo2Mb|2o)~w62uL z`L+J*S1r*azh`(C)oBgqkd_O|klao9Vk?fkEm12Dt&}_##p+t?E83{4;;><_Z{XlY z0B{JfSGd^EL+l3&b`!&Ku48}iVL#7v@c#Mq+IkNDKd#Q z=>KdXioL$P%}r1DXA=(x33`214LVsDH)}dUuBTj2=_PN{(b0*!S=oqcK7a9#?%4lH z(A#-kb-;B@zO_ITyP>Fmz%*C78K=ef1J zr5nK21K{FJcRB7W3l~oh33~cVLjU^r7oXNXfPYhRcK^q;uoL9Ie8SDc^_2TxV`ICD zU)~kf0Qgut8axL$Va0=`At@vxApU3jzdiXk#sBE3|L>kW0zAC`+4Vmj{dZR#cWXCU z7bh%D56OQo%|AN-=fi(=6z9I2`hSSxFFF5t7b|GVo8sL6S~SU)?9XvsVeThlMOLB|dQQzL z>z)>>PIHcS^VOg%_hwk;YunYq&BlGP3uq>WYYmKQf};jc_nX!vU555*GM2V<%b$=F zkV@m=Uj5Y_5c2Nb4We7Ok{0@<$mM_AP8uhq1CL;xEV?2V2k!=5@Ks;Z6gNn7pmYds}Siy4q?LI%qE0` zbh6Sqx<6YAPNN87*DbL#yU(UwfuxjpLoMbt$<>#_4q7dHz-eeTGf||Y{7hC>4l~=} zUO>*KtzbV<0zj1-RK2)EE6HKqmwI19*TbXw6oYK_GOMx~81ZmK$VMM+B_DtzM&)yw z#b8|`<;hK2M(_>Pvt}q*M2qmwqbL5T9o2DCr4%+MWDBOuV`GF#2XRl2Svp()jpvmL zGs2qFW(NIMK5Ak^7%cGgVjn6c7Ey_Ombbm<{wnI3yvR-S?4CI9bnRR_!+AYbLcrCJ)avvE+NUY@P zYT8v}3#A#Z#V+F_+w0p`@)AI<@TmUC(QCQ;E~>;8ir6qYq5!MU7LYYT-_ZdV-N(Mw zrdAVdc&;{4u+WtO?Sf~k?1qfZ)P@_?#B7a}kXcnbH9@wWEu(5s#ptYPvM=g=XyLQn ztU%ngbYWOqL&K-+Yis+7^PlZ<=2(BYzwK_(;_<3cv3ognANY+6^h8&Q{s^-9HvB=- zdiR+Z#MY=FZmfkbOtQve>`MKnqr_#=&s0gdJ(tI~oVkGc^|L)`+rK;hajof#T*z>6 zVGEdYB&#Yv>5=q3v8HQ=WG>20WhTBesU45+Ns`1CBuyy54m(q6py-J(E?bY`g@BHgI%VYHz7@bYCIIH}?%-@yK;$K( z_6rG2{y|VKD7mSra27qtvZwI=-VPnmuS+IavN^Q+D(1T<@FK&KdS2ND^AytAx*-y0 z>O8k52p|Ha+4VkzJD#=8tNAFTVjgZ$>-G*0rS8BP-R!-HOy_w`kGlv7dc9)o7WtJqs~eTw3De5H7c5lFpuBe<60I zy2J#NogQ`!;n*yuO=pD!%$4KHJ8e;}A8fb7tXaC6$`3~kxdqxb$HqKQ$2wr2WQ+?; zlmbO5I1AG}OM&(m4?Xf3$z3HOSulOj;r{lv^5l1v078DQCOh=ntwvgck`TYStSOn3 zl&(9I3r(@d3LB^N7Y&^)?kGWs>&Y0a!Rd+>tn>B>C0OQci1=z7`UfsC*QSb-YTKs} zD;DVz8MC(a<8z86jLclf<8zn% zoWpOTY8blHLIWoP+XJEQdyYsa2${JmARN+|Tf06GH&(0cH0Vd2GE^kK=JBP!o^Hr&}o%MmM{910pwLp@{`ftDex;K!8%_|V(gWMo0RIy zSlWJ;T-aVCr+|zF^oX)yHeWR(W_u8c8KHxm_)B_d+X5VNgbKG3C~?hOBN~J8$EfuE zBn~7RrgpPr{MIt=Qi^PA9kNAp2n)HxH^dnEK~*1qUeW|7-?_@4R8~nZE;LytXg{5t z%APj03MsMR(n9h$AD21r|{HL?}GBUmA$Qk zg0^<1>P)Yz`$K_|phNxiB*~gCh1s||PgRG{t?F5Fqq9CUI|eKgeFW>HpXH6N+%flv z?CA8vA|5t*;DOZWwD2=1P56JjAB?&{uVq_f?H1)+GlILOO+&)WY(|(bpPIo;{TpCS z8A#mk00eFb40$#SJ|v%@Vo`I|M{P+GrD)$`gPg}`4e(1lme>lzJo#jUj zhQ2w52**$$37X2{y;I_7?%tF7gi0wo<;f5BQ z8g+rH(qZOV*4m<@@}`7lV9b~aT#+$%vhe3guqKa;FjQ`Zr*3>#xA8( zFBQFF{97HxzUt~IFEvFM1qLkm1h6b~~^YFX}#iLI5wBsgL7fO4cC^vpUx9RR6{!KT z=)j3O#oExhn|al9M4h5CCkF&gnf!ce^WK?}rDsKP`=oj%FvQ?%eqm&_GMgQEOvJ8r z@HnZc{pc1${FtYrz5k1tN~@m4N^j&mzo$--b|HVwzT@-Yd95((E6I=0#i!wR@ZV;c zVpfYcW%uP`nd$CRgSsCXA8C%43yR2GlswY5ccIVqa!i|RwPnjnj$JVjs#$yecr%Vx z$`lKdK5Wu86CS~Vs5uW35j*tsz^-R8#O-AsL`d|v*bxhtt*+72(=&2#sDvh!{!qJ; zx4-ODH#rt(rhQi`xJv2%y2r?k#hJdeNA$l(ysTKn%l9@#?JE287aavtEaKgh+?Kn- 
zr2jP$I;~6WeUm-&f1%Ta#oo)n$>{&NG5>=v>HN!60J*N{|3b$XJL4Wq#e?4*O0d#X zxl1RV&069S^?N#d*cm6GUU6UTN-vRr8a-BQTKXI0zo)Z&?ZyxbvfcFe>%%LD6#Q1)VxQ=X8mC&(?|;o|)L1_|w95va<6v-od4uu%k$3az%}Ac0 zB)7FerUv(hbC(?mZ4l^-Y}fJx4T3w$-6H;PGl-YCREZJKmZo+b5gK>io_RArn_v%n z*fX!~X%@?_ATD(Y*DX2f94SBX&a4Q+&cs_o@FAqZRE4#CBG+K-^f+5@@-9>vWR#u! zt~+dRZF@IS%FNWZ&a4SJPsNIokkBpC=3toYJR;yliT%uwG@)Og;4J!rZ()YVA(mC zOmZFGU^i2sL0fkkB?`Ih5Zo7Znq!5*&tv9}G%je|Y_rvfj%-(YkwK-YQlQe*`MGJu zo<+H;y`cw)uT8*MfhQMZ(MZLMd00=F_hI$g5n4#9kJp_q3g=ADzmQV+zcm4u6{}}S z+{J_?%_$>H(f%`!e$5z~y@bxbdxDgNsxo&s`khe}YE>m_bLdSsH^&fBH~^+AQXh$v zbBeehn+fI*Bh_jw1Ge%aa?L{myhMtF2(Dl6tLQ?;r#J}pcEh9+%ePt?efJn|OZ!tL z1un^i?XUEu#=~BS)1|N{r-Tj5H|sRh%?x&Ae(ic69jCr`{CaWPo0*HNN3^khL)qoS z0KX8gpO!`L&fx&U4|$R$XK9>RARNsk3`%iKg&^z)0+i| znR%GG!F?0~b|K5jBMm2`=ed$ys;a$DfiJHBfk6-Gq>&MNcB`n?9uS-}-UPzn~R`YN~d6prI$NOV{#zoyhYfW?j<_uZsI_xImwq4Kfpgoh{&-$|D9c1T3 zb!NF)-L@x6hsrTixi>l4=DkA?=UkcEQ!Vc{+P>u_?$zGbt=h?2kkA*Vu^_XSte$&uAmGb;EL`4;0;=Wss5wj_D1>AdZc;yBZw z-g5SnB!gGpw(m|0hMCo}CGFSwvWxXR_eFQU69mzJYD?RqYOh@wZ?2zCvqC%yKxGubI__~GDWM9nswz=jctt=(F>Wt zb-9XP_L#u+ww%OJQTe+=274HiM~Ug41*X-iR z($-d%!Na27*}L%3MJcSWNHMOgbfp+c(c9~p!N?yJ=zIP(u(q{BYToDK)^OT@y3{Pp zxWIzQD@XwI0NrC;1;KbGO^-ti{2CFKV^RT8h3R5X$ah3ei>2vfRb4 zqMD2q&V7-SrR?!q9f3v zs&4*5)9(XSP^lsRFF|dTpji;f9I^ftsDf7m>x2j`Y_#>FK_w}9>_U?{ujl%Ty64|1 zWW-@WQmMT$kqmQuwE~lMuDeQ@5laWI1FZsRw|XmA!Z@?5uz#-|Cb*+?##Qnc@4@|0$cra{InGjFwYQTa}=>ddE*PHJWx$qQ88kpMCp)mi{fO)8soxw}Or#PqnN#TugY^nz ztfVC%`}D}4xA!f6{wi|=)^Z==hoq{AP}J_zeKPOtU33e5^Y%e3EF<_QSxxo z^c+cuwnh6Dntc8=R`|LQ^UI_R@Twdx)dy4DVo$uIv}wz@XW{GC3F>)9+o0?Lh(n^HZwgW!PU^Mm*GrYRHMqBG}Um;{$1roJ`Tda0n z)66Bs4g*iMb{cD{bnE283SEMP+vkNxw8heN4w&1q{*;qe<$REEUA;4_Qpo*4RF}<+ zc|7859DE_69`hB!|A*<&EckfNaWw6Cb0IlbvMQaV83|;t>dbwBsQV+9RthlPIc?(y zJn*v<;@xq68dSwt>0aUVkr6 zoXl-=K^wI(>gCW@$PaE6(c(R+X!QirdxJ zr)>nS_ES>wcqLNLz?h%^JqVOc(dwxq;AZ&)=-GnnuT zTPeTqkVHHL<0V<3K05`v?SEU#1EmQ_tRFe=yx!z+CLTmR^_@PLNOOwruC0 z^wAb3xN1bS=6rJZdrc4q&dW2`%qzfxo{mGRXy@nm*@`nhExTXHhSACoC0iicT13B$NZKW&%zp zs&)*1`GtMnFO7rXBeF6#!{QEoaR=+nC6Bkoe4W1YqnG!Q2w7*$QT>b943}>)83#4Z zl75*1oGxipkNhmn$Kqv1hG)q~D&3)h4ZE?n%RgX}se*d^$yk7DwRc;u8@l$or3;dYNqD*H zab7zfg6UKjz4@MwnJdh$W!8vrC&Az{-1R7C=?(B2leO}+3y0%mr7Zt(x^d(?j{#wB ztkMn5blwu0gv z&68~(kMJPFzBp#<#8iKkB1)Fb_k;u2=lz=JB%!(TyBvD_4JxKo=fV6%MgxI_WChRm zN{Xu*4?p&Zp`Q1Sj9)Z(1I19@kbtlr;_RtBTTty0X?pazEMJmo*yR6pA7QB4jDC3hvug-J zv(0)cmap+6aK?hPI;gt+VAloS_IIBO}ldH}4!MrL}f>|IowU(1=%Ai5i{djaks{6j`R`1H8gA(Y7bCxev z0NRks65i@y=JjJKVznJs>AB{NnXe9X!t8KM?M2i(`+#}iw|{Y5gE~vu*kn4qc6MD4 zlQ~=|-Ij9w)%0j1>+lY>*R}QNhzCcrjv%UuLi^DSJGDKEOypf+=?MY!8YFF@Dv7>Z zZwmo2hAmebuGy4idrQX=Hw+9=p7m1;`}1{Kf}qhED$aoj2JsS&P{9=TM$K=ILPEF2 zl_mMu#RT#)e7%D6F=($rmys`O0+AxGMSPOqwr*?j9o1!YS(on88 zpmfyM>whFSH6T!&CAHL|B&XM#W3oDfXUISPe!d=p-1sUepe?(9J4Bara|X0uVQaQI zIjFhIu!~)-!vAlpMVf=sVT<0Rj*(=$JZecymr9f;uxE@Vx#hL5bPq3$xPNmmd?+^v zE}NPa9rfE88{Ql2C|{9JnpnNQAFl*_kNL%pUVmTUSQP!a%LI2Db29ftu3Z)?rWTk~ zWvG1cN{@{cSdlB0l2b*(tt!qufY~vKO&5n7^4Ix=`RPb_H-MtSnTp%<@pf$u*W>G5 zO7pAYY=NS^;_=@3{b{HIRmEIDWT)z>Qm154`PoEh`9zoM=3UOmzHU>flH-pglw2RA z;Jy(f3#}9b23c7n-^59EvWb6ML%!Q>S1)UU6$UB#?rN}wmqWRe-ICjt0p;o+vYnq~HS>K`BNZz)qlEpu(A&>6|b z8Dm%zdX`%ag~8jCPhLs=_c0Wk_oTrFRyKzF-qiLD;H%Ww(VSdIZf;11^39AkKTdl~ z%}m>r>Sk*@vQ;T*tWEn%21-EL2O%U0idd+Z?^y7ehX z@$Nx)P6k4Gt6va`^T*jBc^bb{Eb$P=-lQyfEB)9HAplIsp8%NK>=x9ltM~#d$|R+A zY<S`3Svf_+ib3(|8?ubbjx#>!zboxnlFfqcfP4UesVe>{G}$rTe`KhKx_&B8PbSNTGl*FXOPjI})Tk7=Zb}jOvY$n}&&P=aJ*Xx>zRJx!ZHXErK zCTyt#LAP?f)=diFrr1fBYi3zArg15#l)SC0MJ^&|Ym5>^@9n3#(Ag~*MVC+Lur-YI z28bgt1nBLWxEiY3ETKS_yqCKcYX$Sef-T8}%%G-JmjQPY@xB9O;MZbL2^!L^1}T9) z>T)lM4(-;q-L*u4PrkEX&Y 
z+SGl9DXrQP7#ON?GKN`h-O-z-G20aG`n57Q1J7Na&RY#g++xWCUOJL763pM@pCJb~ zo`y+E__0=xPxV7%zza1K#F^!Z;h;>Z(T4dcob_Djn!?ym2F)VqVFud|2YH(% zEpD~=;&MkuUlq3Gx6z=PvHR~g(`H8owq}_5R7Kz*O#7P5K#G{>e64!raQgNxl}c8RO<1Y@=;*11V=UD^Hqv50gSIdAT|gqd z!&KwkM}t4Qxz`=td-#w(h}*yxe?7aIZ)~|nN{H_S_M_PG;{BF(9TI?k?H?D*3+R1B zd9?X_%7sSIAMkdB`CZF|sSEH1AVN@ebi^rC&~NmUNpa`i5*`d607;7|VAgrjDlzWR zOAY*^LXy;=FI)Ni@WfxYQTxL#gr~4+1c01EGYe=47fUHd6U48*XZk^#Usd% zpcUu+U^w2YHwuf85o_rDGN)W)N?09FmJsalJ<(|1Rt;yKCOXkZ{Ca;KV0H4f@#u8Q zvo4V64(^{gB$qM0K?vg~xkCuh|ApPl#3rKze~%RZG6=?LhH3+fh4&-iO@fXSler0B zaZld#6bPJk@{z!qxzCXdX`)LkGrI29aDU9gb;+Tw z=0KhO#!A0A=HawSwc&3$ghdjgQ%#}=&S}D)vA#f-^4N@R?V7f(T0Oyvt@I zY!-t+4t^N2!^aDf?4BMU5{>rzKuwacyd)>mwE32bwG&v?@B`vKqUepLHb7TVP}LnC-nG4mj+zAy7$Cm=iLHJfguryT(QU{k-K6HU@aEa?wC z3{R*VY0)8LHJVP@W1`>7@xe%p8mv6b@jh{ryg7g3Udx2a(JhGGpKn#SWi_BzX%T9@ z?vJ9b6tX_(oQVg`C&f3d1u;mwcdCOIPrRa!K4vHVRG(+U#t4N?lecYxkK3;+k-3yK|J5ik76G`WL4Gl#w?xdN(EJ|V3RK&&1&j3d1!l zA3Ih3IerA=$vTf#xm_quwU*wft|Q!9)3>85Z8wE~X`l1k zPBjYTEimG1*p9DT3~`RsP-~a(xcZ0dO`-rLB?^)Usxv=J zk1a|O)$=XgvTlpZivEbbWyMK29$9zGMH&q89xPMmz3=r!+hz0=o$&JdfUHS$^y+v) zE=Nvw0T%3jg_Hh)IR(~+>>Kz_%=Qmck=kq0lzryG-*Bj=`alM z%k)2*ia>87^VZx6z>?L-?k{#CKI;|e*$4a~=A+3g{4iN zC>5@kj4j$sw6r9$`L*XfYgmp0X&VJi?-VO8@|aCk$5som$}k6r_T>W$HG5KK<_imO z(KkWKURaNb`e8`r(0uPllUdy7E(@!U3rWF{1J@1MRwH9v$~1L))C)6mB;G?GScW!e z{0rmBKo~yr`q}FBQKyxyM1^a)LQt>=-@L!4)A$xbxw!+lj$C1~9i3vFUn_Bdvzi!T z{ni=N;Z7esH+Nce1}H*X;OsYl7@&2M0m?cDT#-{_WWr_^SSf26;zPeJp=K@i_Ai*c zr6gb6TMJYF2na{SN_ZO5(42(ZrGro}q+Vso#=F`8!>QlJ4w6kaIn~7k+V=X+U=HlK zos0xUUBsxa2!<1oVyoDqGCJrV44-UzW~{Uhh;8Fr0MjUg?q*KQ@r^MRpC>tq9Vw(l zt!%4omeADa%b@$jpVynWkS6K=IE;su(8zj8WaU>G)Y^aEj(u0nYOO@UO+U_V<~A?H za+IDdh%L}C+#E0ANMTDRvg0~uzIL#(vivtr=b7(hAVZM#94dr?&ohOOpuRFReftJ1KKE)xesc zfIGi`UXOz{a@;tg5zl{T!K8z6B(i^JXtABRg0aTwR`B5CzaLot*^5pZYWv&aaYAm< z#qZ-iN|OCuvCbM1Sokh`E2rsWY)n{bXSN~NXZRU5#oep#QS%g4=}s?0PE)9C#FEteO9Oajt|R{v%d;e91|JSJIHC ztHFEydUtPI898GE4?IoOuFtJDnJ0TKEtw^$r;JJ58x}Pc9Qkl|CD- zZp)Le3%vj$8&fs!`OEZVO#_?~OZV6JU6d8(jokomEB9tUpIMEJiSPm^0;Yl%;^)i? 
zR`V{aCFJ0lm4Tyzl!p^mXlzCk!x>EIS=1ZyMRxBp(`4LW{BhFkL3uJ!c7SD<4z>cu z16#tg0WXMN?oVsq2uC?GV(|x+iKE*6e=5!YaHK)xPvWJp6$a&8@P4!oSF!0!L%~w+ zx6k&og~qhL7C4%ViJtg`0w2DK6DQ9!pV=bI?@@)-rJQ&c^+Y7KCW+DbDF&YJ5%*+s zmx~KJg16`WrFRfPVaoyyI~S(4rmOuJEkgkpx`BrTzK6E(7|ecl=KJef=@WCs+`wcsR+@$~QI`2n$!G8cW5Ei%onR#FgY= zXyP2UD&+0JK3C=hz#xY5)7J6r_g4ovFSGcCe1VBu(*mG9Yvsl`e>k<_ z%l!$mH2y!Sr<#Cum#OD~lNA!Li)tcnu>TjQ_(@BV%u|f+L!>F;JL9{YleO4 z*|WWYSu>&`F@>q^OdOyfWC6@|rShIXaW#0abeY#gyZ0EPz2}(GsMgZG0aWW0%!uK7 zAGQ!9-+mJP?Ah$0gI>8g$Qu!xys%&nfHy`Ge=mnusB*dJj}A`4mRR)ofdlJx1?jt` ziW-QHNG&hmStmlX*$wU9h9;=#kgWDd1T^0N7C-;V)c0DvYt;6 zlumYKLEpF5YoVc`xdxxpkMbDif)*+~HupaS?6;Q_;T`Pm0{rb9H8d)&`K(OIto~7B z^^=x;c?kgTgJ7$=#GD({XlRB;*)|>*3Ru* zZxzMA#YsxKh(i7H8WsZP%_-+iylFRr@|cq^NEQ4{ZB4O{w=}UFzYK{e2ZD`uXqZv+!>aZ=#f)9CUELH(@}{Gsb_*@K zHBqɦXO{QT_f+b6?t`4krgYSVt?4!0y7qjG;i{%-DE(A7OLY32cv?`ycg|Vr0 zP|rO**h(bu#Acz`xSMw)V>l_fOs0i|sy3nBl{!>bKR>PT1|C=vZB{1!SwI-K)o*a6 zpIWHH`}yiEHnV!fi|LXgBd8Jha-Qg$WADFju2(QSTwE37w@*ej1aqoGZPKyB93{Dx z`y-NUr_AdoSdG|>=V=7=_m&5lc5OO=h4@rk~h& zKhK@h?(ElxVN*maR@G zEYElK9~id>Bep!eDtQ9^Od5ei)X^aCa~l6RHU;) zrR@nqwY1V2uWtD#wvXq*4#(G-V0twaxrAQ|DOjYY{rV;4l`i)3d>RsSS*^?ydk}h+ z8T#{{bP<7%f@4wrmzb-YJkX?*G_^Wxwbc}O6LsHBAq20`{+TsF<#TM1BK4^|>D9;J z?i(RFbP0e_OQy}7$T5t*=^qa*@DB1HU&ECmqx=j@*Lrfd?Tb*V?C>Bk+vwzAo+`^B_o&pNZjIk_eHq}hx$4b{N!Qeg0l zXEsobRunlKokU}G0#!LgX%FkWs{&6=0@0Rl$vNWHTQX1)$!wz;i=*|0$^Wc|myW_K){E}P%%i}ztX>M&{#!&`_d<%xi4so^xf9!XRR~6&RJaY; zL`I4*g*k6o_`L(kX9XFxr0o~qZo$}44iyyN_Vo=ND>8lQyzZ0YINPYT6SV5N0mq0^ z5nOEd(dJ$(=viU&=ZWmLYLv(8#hP_C_tOqKM=L$MXhvbeX?%H~k~wuariPt(+Rpv6 z9x+Hjs#70iZF578wv)QdeDo~TI;XuJYMa%)%7}UBR!5M=c-G#fmi+7^x_=;y;+`*^ z;p(6n$C;mJvXE(jet!S)OXt=Lxl2$H6X2J_VN#K}1@y6}yiPs7CPd?m5^DYhAVB@U+ak z8R`_zrggS^#>)idMUCfFoO!FpYK%woBTuUwn~8^@=nZu;M-mV=u?;}!JkioLoO5mE zFx@Lp8_3eDMa0Cf(LzRR%8gkIO-es;7*{{LOiDsH3u)mQIR5%SH7I$Hy^Qi+tn96x znLtXa)(W78bl4L1V&dMdPHgdRU(;J<$6!;KC&lSPh|tC-uq5zB;E88sjPH&q#B^rX z*ypm=YLYFP9Z*NSJ}2kr>EHkot9D|e5=6#dKS5P&@$m5M%@&g(c%?=z-`Y3}Klm$% z@V9+G^_-5;j3yGLkgq{!_701!zh#|eHNRwzHyhx5$le5xApPMet z2W=k{MZOca+iQ9dFW#`P)DkbwyL(u`Bp>@|N3?}wtjs{M>kS#>!WHclM>Hez+az#{?Xp z9sQ#Y?e$VlKsvR3*kfk}H_&HG{nmwI!bbr1k8xm2Mk&#qEWAii>q<|;(wx(? 
zTdSbX)wWXZ*2BW>NMUoW$x}#RbO1lX-BdjpnXX7DEUC#X)efgw-({Z zR9e22Zxti`*Bk!n*KswTtEIC5cZAGDdLG=Pe-mpv2`a0wf`?SzVpZ(UP+ zN|G%B(lU+ne{%BWO~idS*g&pU?u^IH=g*%j=<9!Y8g|aHJoJ*bJ4WQzTnRB2@MB+_ zV8o^iC!J0_le!|}_{=0;OH|7s^$)jar*}rnbHzn0nl7{VCgqlH&zc2KACq-^uLQYy z_u~Ca1*9wSY*}b}`V9)KW{%u1q^F#1dK?**u=hpX%<;k zcp$*af-aFsyUg!`06)wrU2SO^8f=}@#K$rC6-{0pFaGT=TFIVhiR$+zA3DGete&GGWZpQjbquCzns$ivAmB8R~%QuzJok}HiBw-VBP5Z>$}NnF&&Fq$MnLf z#HuWOt+Uv=4cWK+5nGYJA6~P~J)K-0m`I#a*Vn{tnskcZ>&2EZr`FA9bG2vHk6uXO z#*qFK68(}5L5M9{&7HidTJ&6vAAA=V69XE)5KCD6LM7m#`ZfDtS5d@|aelB8{_3Pu zODsY~?W@53f4hKF1M{KaZ%SV;<0P8PG-BJhQw`qP%l&sbunv+<0O&*K7?r8{GZbsL zr2L5p5B?u%R~Z)78m$#U6ckiS8bwI~X=zXq>F!jzo1s%g5e22Bk?xU>8A?UEW2j+3 zdWHcchGy=@<5AB!dY}6|_uocj&z^68Ypr*^?^^3ae`&ayE&pU<{%)v2`630Ep{-@l z4sv9~pFhjK^MP7}-T6K9eu8B`Q;`IbfKs={lFC{U%}-UR!TS1c-fgItOBswtX_c|` zr)!>8Om|G1botN~fgrieZ0t|Qj<0aM`?w{zMS;+fzj1tpe*Mk1q*b@iVx)!E_jAOjnLwdVO^>vcXdXA|9zUFRbsf z=uwKiXUd-2^+!d(RF6}3yd3F4Z0ULYd)9fx@>Es(R*zN zo+5)B?!K1WdyFXVd5fs-jg1!X!JSSL?6#)IDQx!a%_v_*crRv64vAG4yKdzb-c?Y@ z2v2>f^WZOvG)Ut-XAltilUDtQThGsI)fsM%68rRT zlaArwg_sRJyL8Z}1DE;UPZhL9_$gV`{tMmtH0tPMqvhr_+;9rnJJ!Q#pFitdI)@7w ztx-wF!b`ADLxtT~)x;`*2NX`QjXD#aQc1bB!`8d^KtqxBj>79wkUCG9Fk8NOmCc>mY$FKW@DUhC0lJ>Wv8H08=W82T?V?|Hd>~mlAf_^r&M_@ zMbf^Arv~@l#FmIL-jv6kNZ1bRE;_iQO=J92g8q7c|JMzq4s4E~n1qas`KH|2PTK1E zQmMoC!MCE$&78OKzrBEScTOJ(ayv%`LW+_v0d)I2Kx+&s&bOL6aLbwOn~`I+-Dm~x zITrfzE^617Ycu<>Ci3MuAaZ?=qPH4mY;@NTc*bg()kVmGbBcwVKkX=q8~!Le*DHWli@1TCQ# zjGjgK9!v^S-s`@##}Ju&YkM_TpH0)vzHjQH8-Chx&MfqMpw$AuC2QT7Fr^Xkzv~cSl2tTO(=GP71ZHsU9$l>=9}F6| z*T@lzpXvv%$1dX^$blW~(REb;zN2?^Ydo_J!2lqI3Ve||=^Qge&@S)mRYJH4Pv3lR zm(aGNdw5SGos*6ETe{{Xg||ic;%0XQh(o0N_B6bj4G{^Bvw$IUVCjS>*Ud zt&Gw!Gu;Dny}zOdk50O2E(I{za;PCY{v?Ql&6Nb&X2QMx}252N5F4prPWfsJVMMUhD;$G|V+XFVBP z%c|=q%t5!l^uC`FNyA&atSn!eG5NByfS9NC?~UC=(${qGD#g-uRP7YMV;63ZUv>S%g8C<|yk z^=Y$87Yk^}TFb)2$Cn4W~GVZ4^EF*neRx9>}_g z7;Y-vEC&40Ae+qZr;=Pc=NUB4&(^-p)Mvz3lzzBzK;SjE=e#k|rKeV{MRuYp01QZ$ z0Rpfg6$bEJN5VZOr34Oursm==jER5K@}lkCk||~^K3k47HkTf%=&>@z3?y@Jfaa0Q zJTAD0t;{VXHzd=pU;ZWB{$sr$1Y5`vtr5pbPJmx~Gnm_}^AsrGkuA0%yIJ(`8c8~F z{O9dL=IQ)L!J9o*_xh924KNu|1}P~i53j>kMB6|tDjU#A&X1SAIXdYTZnW*8{5`@4 zWSZ7jX;DBz%7GHFix=Ral54*rZCA+a>?~fPR43PrbM1=jhS>nwXte#wos#6cpZ}hl z0*D=QE?s+zI>#tPqu)fzs|BDHN4`(mH<%wLXCQoL*Qm}+J&*twf62>P@2`OKA5l=U z4)^ZejWG;RTo(0_DgJN{yl`^~tTh~7=DYpiBq(GM(Ul(XRd#OxJwd`W_V=V1gaF6f zzWn|V^%G^qP4gFR+J-r(+h2l+UbELtG6rz&!SrySv&Z{F3aL5(+Adr5;CQ1i3bxW9 zE?5!G%DW(YRJazU+?Z^;(c`n|y1(8KUpyww@O{dFb{FTOk5rE<$gB%bh!ek?D#*O^ zzD4f_#yqD;Nc`_;@*{Rn-#zcPLV9BxWt^AkI$3lHZ~NwzsT4#r&-T2btN_5_au^lqk9SE{h>3{eV#aS+HOW<-*q=R@ z81ru@h&8pas!Y$!P%Ji4^5ZdAy*aOKk~c_%m1kw9OuzOMdN{ z;OC@YP*(>vk!4Ngbu1WHp#=U8SPrrdnlnG%-#@fop8E`8xa~(10nzr%k$aPfIPZGD zlBAJvZe}I9RRf`u)Y}?;T@hV{o@9sY&_w~6Xm-Be1OD+kzv}z<>Eb%6huZQu%cUi< zYTC);6QX}FR0ma@6XkylwVNzs9LoVR0l8>f{I(C$>~{6U%mK_2w{(*2k|M@yJ(u;o zP#>2mhYb@CfLIm_hyp0S>fVxz=ld*S?7d#Uw|9UPD_=OerW*DptvF2t#F>DZx^1li zu0uP{k1GNc+bZcn4h~LM%?$5P82-;I&&yXuj$I+!wc1P%XZnH%Doy3BHL43PcrE*R z9Rjpm`_|e`o{e;%!R?aQ@=N8pe0;}PTF7;?eu+Z!6My|8LDd-W*IqNvMHHL-<0Cwe zOixtS+6rIm+G=A#YgoZ^bij~)nru(9klALO;YM%AHjtU0b-~iYvkVfh7(IwR%;~i} zzU3NltZwAL;J`{}q?6+(Vl18@p#6&G*kBm2!46tOS$L8&0Rd-P~#<(Hny z!j-n1>W#LrGavJTSyUt)br?w%M1gR$bR?^XCPCay7p^b;SI;B$#&xKdRd^| zNnDo7<3En-+qRYTAYU42s1-X(f>g#ghd9KMGxho?C=?YLHA>U@_$O-!1Yxw8-j|i9MJtzPa{)q^vf6^82YHscXq!FQ@yt; zAjr7i5ko1_0X4=dbzjO)dI+#F&n0z!D=h-{ppov<;zFMBAa-M*rh(nB``uEv$Aln# z zCP~C6gdJkEhZD3mASa$Au6&jl+Fp(_ix1FQkG}x)o%)bzNB(Apz#lvf!`|JE-J(mEy(B*ngpE%_2lI^31!b1T zG3ANV^(M!co(o-$Hqm!s2O@NwOjpDq06HIqV3puNmt8v$R?H4c>U)?(AlT`d%+>9f 
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip + + + ShareGPT4Video (Video) + ✅ + ✅ + + git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video + BurstGPT @@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -246,7 +254,7 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -612,7 +620,7 @@ vllm bench serve \ --prefix-repetition-prefix-len 512 \ --prefix-repetition-suffix-len 128 \ --prefix-repetition-num-prefixes 5 \ - --prefix-repetition-output-len 128 + --prefix-repetition-output-len 128 ``` @@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \ --endpoint /v1/chat/completion ``` +### Videos (ShareGPT4Video) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"video": 1}' \ + --allowed-local-media-path /path/to/sharegpt4video/videos +``` + +Send requests with videos: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completion +``` + diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index c62934ed94..e1a856026c 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ) +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and "bytes" in video: + video_bytes = video["bytes"] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + } + + if isinstance(video, str): + video_url = ( + video if video.startswith(("http://", "file://")) else f"file://{video}" + ) + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue - # TODO: Also support ShareGPT4Video. 
if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 3532a083fb..f4fbfad2d1 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - Supports three input types: + Supports the following input types: 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key containing raw image data. - Loads the bytes as a PIL.Image.Image. @@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]: " or str or dictionary with raw image bytes.") +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and 'bytes' in video: + video_bytes = video['bytes'] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + } + + if isinstance(video, str): + video_url = (video if video.startswith( + ("http://", "file://")) else f"file://{video}") + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue - # TODO: Also support ShareGPT4Video. if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: From c32e6ad1f63631fd8033f0cca3a35d5e48ccfc7f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 19 Aug 2025 20:39:28 -0400 Subject: [PATCH 410/932] [Quantization] Bump Compressed Tensors Version (#23202) Signed-off-by: Kyle Sayers Co-authored-by: Dipika Sikka Co-authored-by: Michael Goin --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 3c3ac0abf5..365457436f 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -39,7 +39,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.10.2 # required for compressed-tensors +compressed-tensors == 0.11.0 # required for compressed-tensors depyf==0.19.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From 0167efe20d3d2280c3da6aea94a6f59afec5099c Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 20 Aug 2025 09:25:59 +0800 Subject: [PATCH 411/932] [Core] Optimize scheduler request removal for single completions (#21917) Signed-off-by: chiliu Signed-off-by: chiliu Co-authored-by: chiliu --- vllm/v1/core/sched/scheduler.py | 14 ++++++-------- vllm/v1/core/sched/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b3defa4431..f9a7e21014 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -25,7 +25,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.core.sched.request_queue import (SchedulingPolicy, create_request_queue) -from vllm.v1.core.sched.utils import check_stop +from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.kv_cache_interface import KVCacheConfig @@ -872,9 +872,7 @@ class Scheduler(SchedulerInterface): # Remove the stopped requests from the running and waiting queues. if stopped_running_reqs: - self.running = [ - req for req in self.running if req not in stopped_running_reqs - ] + self.running = remove_all(self.running, stopped_running_reqs) if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) @@ -1000,7 +998,7 @@ class Scheduler(SchedulerInterface): else: request_ids = set(request_ids) - running_requests_to_remove = [] + running_requests_to_remove = set() waiting_requests_to_remove = [] valid_requests = [] @@ -1013,13 +1011,13 @@ class Scheduler(SchedulerInterface): valid_requests.append(request) if request.status == RequestStatus.RUNNING: - running_requests_to_remove.append(request) + running_requests_to_remove.add(request) else: waiting_requests_to_remove.append(request) # Remove all requests from queues at once for better efficiency - for request in running_requests_to_remove: - self.running.remove(request) + if running_requests_to_remove: + self.running = remove_all(self.running, running_requests_to_remove) if waiting_requests_to_remove: self.waiting.remove_requests(waiting_requests_to_remove) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42ec95091f..42d3e5c68b 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib from typing import Optional import torch @@ -7,6 +8,38 @@ import torch from vllm.v1.request import Request, RequestStatus +def remove_all(lst: list, items_to_remove: set) -> list: + """Remove all items from a list that are in the items_to_remove set. + + This method optimizes for the common case of removing a single item, + falling back to list comprehension for multiple items. 
+ + Args: + lst: The list to remove items from + items_to_remove: Set of items to remove + + Returns: + Either the modified original list (for single item removal) or + a new list (for multiple item removal). Callers should use the + returned value. + + Note: + For single item removal, this modifies the original list in-place + and returns it. For multiple items, it creates and returns a new list. + """ + if not items_to_remove: + return lst + + if len(items_to_remove) == 1: + # Fast path for single item removal (most common case) + item = next(iter(items_to_remove)) + with contextlib.suppress(ValueError): + lst.remove(item) + return lst + # For multiple items, use list comprehension + return [item for item in lst if item not in items_to_remove] + + def check_stop(request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None) -> bool: From d46d417b5897d7eddb002b61b19e8cba029c3dda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 19 Aug 2025 22:18:52 -0400 Subject: [PATCH 412/932] [CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce.py (#23132) Signed-off-by: mgoin --- tests/compile/test_fusion_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 4c3cf6c2a1..dd31e0db1f 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( From e58c5a97688750e7930f13b6fe556d9a28a5b2d9 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Tue, 19 Aug 2025 19:32:47 -0700 Subject: [PATCH 413/932] [Core] Add torch profiler CPU traces for AsyncLLM. (#21794) Signed-off-by: Chenheli Hua --- vllm/envs.py | 6 ++++-- vllm/v1/engine/async_llm.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 861e4c6a1b..70068cca66 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -667,8 +667,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - # Enables torch profiler if set. Path to the directory where torch profiler - # traces are saved. Note that it must be an absolute path. + # Enables torch profiler if set. + # Both AsyncLLM's CPU traces as well as workers' + # traces (CPU & GPU) will be saved under this directory. + # Note that it must be an absolute path. 
"VLLM_TORCH_PROFILER_DIR": lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 664fec31a4..342d7b24f8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import os +import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union import numpy as np +import torch import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig @@ -144,6 +147,26 @@ class AsyncLLM(EngineClient): except RuntimeError: pass + if envs.VLLM_TORCH_PROFILER_DIR: + logger.info( + "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 + envs.VLLM_TORCH_PROFILER_DIR) + worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + ], + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + envs.VLLM_TORCH_PROFILER_DIR, + worker_name=worker_name, + use_gzip=True)) + else: + logger.info( + "Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501 + ) + self.profiler = None + @classmethod @deprecate_kwargs( "disable_log_requests", @@ -562,10 +585,16 @@ class AsyncLLM(EngineClient): raise self.dead_error async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + coros = [self.engine_core.profile_async(True)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.start)) + await asyncio.gather(*coros) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + coros = [self.engine_core.profile_async(False)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.stop)) + await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: self.processor.mm_registry.reset_processor_cache(self.model_config) From 64ab3c7253afb8cc2008777153812109bf92d7c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 10:33:41 +0800 Subject: [PATCH 414/932] [Doc] Update V1 status of various pooling models (#23189) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 26 ++++++++++---------- tests/models/language/pooling/test_gritlm.py | 9 ++++--- vllm/model_executor/models/gritlm.py | 6 ++--- vllm/model_executor/models/interfaces.py | 11 ++++++--- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1d165fa6f1..7908e42387 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -363,7 +363,7 @@ th { | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. 
| ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | @@ -436,17 +436,17 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) @@ -476,7 +476,7 @@ These models primarily support the [`LLM.classify`](./pooling_models.md#llmclass | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | @@ -493,12 +493,12 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index d21987571c..17a55d916b 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -14,6 +14,7 @@ from ....utils import RemoteOpenAIServer MODEL_NAME = "parasail-ai/GritLM-7B-vllm" MAX_MODEL_LEN = 4000 +ATOL = 0.002 def _arr(arr): @@ -97,16 +98,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL) def test_gritlm_offline_embedding(vllm_runner): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 9e7490e3c4..3f6790269a 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import SupportsV0Only +from .interfaces import default_pooling_type logger = init_logger(__name__) @@ -215,7 +215,8 @@ class 
GritLMPooler(Pooler): return build_output(pooled_data) -class GritLM(LlamaForCausalLM, SupportsV0Only): +@default_pooling_type("MEAN") +class GritLM(LlamaForCausalLM): """This class implements the embedding model for parasail-ai/GritLM-7B-vllm. The class inherits from LlamaForCausalLM and provides a custom pooling @@ -241,7 +242,6 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): prefix: str = "", **kwargs, ) -> None: - # Use full attention for pooling (this is why V1 is not supported yet) if vllm_config.model_config.runner_type == "pooling": hf_config = vllm_config.model_config.hf_config hf_config.is_causal = False diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c425488f83..9415e67924 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) + TypeVar, Union, overload, runtime_checkable) import numpy as np import torch @@ -641,11 +641,14 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -def default_pooling_type(pooling_type: str) -> object: +_T = TypeVar("_T", bound=type[torch.nn.Module]) + + +def default_pooling_type(pooling_type: str): """Set default_pooling_type decorator. """ - def func(model: object): - model.default_pooling_type = pooling_type + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore return model return func From a634733f67b39fd9c1da1a861ba39f75efb576f3 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Tue, 19 Aug 2025 22:57:47 -0400 Subject: [PATCH 415/932] [Attention] Optimize make_local_attention_virtual_batches for Flash Attention (#23185) Signed-off-by: linzebing --- vllm/v1/attention/backends/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 5e6bc33183..94dd3d2629 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -464,8 +464,9 @@ def make_local_attention_virtual_batches( attn_chunk_size)[arange > 0] # convert from q_seqlens to cu_seqlens_q - cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0))\ - .astype(np.int32) + cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) + np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) + cu_seqlens_q_local[0] = 0 # compute the seqlens_k_local, # basically a full local attention block for all but the last block in each @@ -508,11 +509,10 @@ def make_local_attention_virtual_batches( # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) # ] - block_indices= np.broadcast_to( - np.arange(pages_per_local_batch, dtype=np.int32), - (virtual_batches, pages_per_local_batch)) \ - + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) + block_indices = (block_starts[:, None] + + np.arange(pages_per_local_batch, dtype=np.int32)) + block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - + 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) block_table_local = block_table[batch_indices, block_indices]\ From 941f56858a48e097391cfcc451c3f6d88f7cf20c Mon Sep 17 00:00:00 2001 From: Louie 
Tsai Date: Tue, 19 Aug 2025 20:14:32 -0700 Subject: [PATCH 416/932] Fix a performance comparison issue in Benchmark Suite (#23047) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- .../scripts/compare-json-results.py | 144 ++++++++++++++---- 1 file changed, 118 insertions(+), 26 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 12c4ba6aa6..50431d0cd4 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -3,44 +3,129 @@ import argparse import json import os +from importlib import util import pandas as pd +plotly_found = util.find_spec("plotly.express") is not None + def compare_data_columns( files, name_column, data_column, info_cols, drop_column, debug=False ): - print("\ncompare_data_column: " + data_column) + """ + Align concatenation by keys derived from info_cols instead of row order. + - Pick one canonical key list: subset of info_cols present in ALL files. + - For each file: set index to those keys, aggregate duplicates + - (mean for metric, first for names). + - Concat along axis=1 (indexes align), then reset_index so callers can + - group by columns. + - If --debug, add a _name column per file. + """ + print("\ncompare_data_column:", data_column) + frames = [] raw_data_cols = [] compare_frames = [] + + # 1) choose a canonical key list from info_cols that exists in ALL files + cols_per_file = [] + for f in files: + try: + df_tmp = pd.read_json(f, orient="records") + except Exception as err: + raise ValueError(f"Failed to read {f}") from err + cols_per_file.append(set(df_tmp.columns)) + + key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] + if not key_cols: + # soft fallback: use any info_cols present in the first file + key_cols = [c for c in info_cols if c in list(cols_per_file[0])] + if not key_cols: + raise ValueError( + "No common key columns found from info_cols across the input files." 
+ ) + + # 2) build a single "meta" block (keys as columns) once, aligned by the key index + meta_added = False + for file in files: - data_df = pd.read_json(file) - serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - # Show all info columns in the first couple columns - if not frames: - for col in info_cols: - if col not in serving_df.columns: - print(f"Skipping missing column: {col}") - continue - frames.append(serving_df[col]) - # only show test name under debug mode - if debug is True: - serving_df = serving_df.rename(columns={name_column: file + "_name"}) - frames.append(serving_df[file + "_name"]) + df = pd.read_json(file, orient="records") - file = "/".join(file.split("/")[:-1]) - serving_df = serving_df.rename(columns={data_column: file}) - frames.append(serving_df[file]) - raw_data_cols.append(file) - compare_frames.append(serving_df[file]) + # Keep rows that actually have the compared metric (same as original behavior) + if drop_column in df.columns: + df = df.dropna(subset=[drop_column], ignore_index=True) + + # Stabilize numeric key columns (harmless if missing) + for c in ( + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ): + if c in df.columns: + df[c] = pd.to_numeric(df[c], errors="coerce") + + # Ensure all key columns exist + for c in key_cols: + if c not in df.columns: + df[c] = pd.NA + + # Set index = key_cols and aggregate duplicates → unique MultiIndex + df_idx = df.set_index(key_cols, drop=False) + + # meta (key columns), unique per key + meta = df_idx[key_cols] + if not meta.index.is_unique: + meta = meta.groupby(level=key_cols, dropna=False).first() + + # metric series for this file, aggregated to one row per key + file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + s.name = file_label # column label like original + + # add meta once (from first file) so keys are the leftmost columns + if not meta_added: + frames.append(meta) + meta_added = True + + # (NEW) debug: aligned test-name column per file + if debug and name_column in df_idx.columns: + name_s = df_idx[name_column] + if not name_s.index.is_unique: + name_s = name_s.groupby(level=key_cols, dropna=False).first() + name_s.name = f"{file_label}_name" + frames.append(name_s) + + frames.append(s) + raw_data_cols.append(file_label) + compare_frames.append(s) + + # Generalize ratio: for any file N>=2, add ratio (fileN / file1) if len(compare_frames) >= 2: - # Compare numbers among two files - ratio_df = compare_frames[1] / compare_frames[0] - frames.append(ratio_df) - compare_frames.pop(1) + base = compare_frames[0] + current = compare_frames[-1] + ratio = current / base + ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio.name = f"Ratio 1 vs {len(compare_frames)}" + frames.append(ratio) + # 4) concat on columns with aligned MultiIndex; + # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) + concat_df = concat_df.reset_index(drop=True).reset_index() + if "index" in concat_df.columns: + concat_df = concat_df.drop(columns=["index"]) + + # Ensure key/info columns appear first (in your info_cols order) + front = [c for c in info_cols if c in concat_df.columns] + rest = [c for c in concat_df.columns if c not in front] + concat_df = concat_df[front + rest] + print(raw_data_cols) return concat_df, raw_data_cols @@ -67,6 +152,15 @@ def split_json_by_tp_pp( df = pd.DataFrame(data) + # Keep 
only "serving" tests + name_col = next( + (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None + ) + if name_col: + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() + # Handle alias column names rename_map = { "tp_size": "TP Size", @@ -181,7 +275,6 @@ if __name__ == "__main__": f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: @@ -189,8 +282,7 @@ if __name__ == "__main__": text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - if plot is True: - import pandas as pd + if plot and plotly_found: import plotly.express as px df = group[raw_data_cols] From 1a3079a15e5c8ae2790a1897f82e5af0d68a6921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B8=B8=EC=9E=AC=EC=9D=80?= Date: Wed, 20 Aug 2025 13:02:50 +0900 Subject: [PATCH 417/932] chore: support pytorch format in lora (#22790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jaeeun.kil Signed-off-by: 길재은 --- vllm/lora/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e6b19d4748..3072047a26 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -207,6 +207,7 @@ class LoRAModel(AdapterModel): """ lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, @@ -255,9 +256,10 @@ class LoRAModel(AdapterModel): check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) - elif os.path.isfile(lora_bin_file_path): - # When a bin file is provided, we rely on config to find unexpected - # modules. + elif os.path.isfile(lora_bin_file_path) or os.path.isfile( + lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. unexpected_modules = [] target_modules = peft_helper.target_modules if not isinstance(target_modules, list): @@ -279,7 +281,10 @@ class LoRAModel(AdapterModel): f" target modules in {expected_lora_modules}" f" but received {unexpected_modules}." 
f" Please verify that the loaded LoRA module is correct") - tensors = torch.load(lora_bin_file_path, + lora_file_path = (lora_bin_file_path + if os.path.isfile(lora_bin_file_path) else + lora_pt_file_path) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) else: From f72902327246bc68ff0d196a89cc81262f46de1b Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Tue, 19 Aug 2025 21:09:27 -0700 Subject: [PATCH 418/932] [CI/Build] Also check DP in benchmarks throughput script (#23038) Co-authored-by: Simon Mo --- benchmarks/benchmark_throughput.py | 4 ++-- vllm/benchmarks/throughput.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c51b579686..c7f290e1eb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -597,8 +597,8 @@ def validate_args(args): # https://github.com/vllm-project/vllm/issues/16222 if args.data_parallel_size > 1: raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" ) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0c19fa6dcf..f022a55e62 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -434,6 +434,14 @@ def validate_args(args): if args.backend == "mii" and args.tokenizer != args.model: raise ValueError( "Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" + ) def add_cli_args(parser: argparse.ArgumentParser): From de7b67a0232e35ae8e8ecd944aeddfc8cbc02631 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 13:06:42 +0800 Subject: [PATCH 419/932] [CI/Build] Sync multimodal tests (#23181) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 10 +++++--- tests/models/registry.py | 24 +++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0fdc182b9e..8aa0dc7e8e 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -275,16 +275,17 @@ def _test_processing_correctness_one( "google/gemma-3n-E2B-it", "zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", + "zai-org/GLM-4.5V", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + "HuggingFaceM4/Idefics3-8B-Llama3", "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", - "HuggingFaceM4/Idefics3-8B-Llama3", - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -315,10 +316,13 @@ def _test_processing_correctness_one( "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "stepfun-ai/step3", "fixie-ai/ultravox-v0_5-llama-3_2-1b", 
"openai/whisper-large-v3", "omni-research/Tarsier-7b", "omni-research/Tarsier2-Recap-7b", + "mistralai/Voxtral-Mini-3B-2507", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index cbdc9edbbc..28fe906316 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,9 +215,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", trust_remote_code=True, is_available_online=False), - "HCXVisionForCausalLM": _HfExamplesInfo( - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", - trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", @@ -298,8 +295,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -405,22 +401,24 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", - is_available_online=False), # noqa: E501 + min_transformers_version="4.56"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", # noqa: E501 + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), - "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", - trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -464,9 +462,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 - "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True, - max_transformers_version="4.53", - transformers_version_reason="HF model is not compatible"), # noqa: E501 + "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", + trust_remote_code=True, + 
max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible"), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", @@ -496,8 +495,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 From 8fd920924c8c13fb757c324f9e73c70d2d5f3029 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 19 Aug 2025 22:50:29 -0700 Subject: [PATCH 420/932] [BugFix] Fix stuck stats/metrics after requests are aborted (#22995) Signed-off-by: Nick Hill --- tests/entrypoints/openai/test_metrics.py | 95 +++++++++++++++++++++++- vllm/v1/core/block_pool.py | 7 +- vllm/v1/core/sched/scheduler.py | 9 ++- 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9107d08983..ff2e7004ff 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import asyncio import subprocess import sys import tempfile @@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert metric in response.text +@pytest.mark.asyncio +async def test_abort_metrics_reset(server: RemoteOpenAIServer, + client: openai.AsyncClient, use_v1: bool): + + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect no running requests or kvcache usage + assert running_requests == 0 + assert waiting_requests == 0 + assert kv_cache_usage == 0.0 + + # Start some long-running requests that we can abort + tasks = [] + for _ in range(3): + task = asyncio.create_task( + client.completions.create( + model=MODEL_NAME, + prompt=_TOKENIZED_PROMPT, + max_tokens=100, # Long generation to give time to abort + temperature=0.0)) + tasks.append(task) + + # Wait a bit for requests to start processing + await asyncio.sleep(0.5) + + # Check that we have running requests + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect running requests and kvcache usage + assert running_requests > 0 + assert kv_cache_usage > 0 + + # Cancel all tasks to abort the requests + for task in tasks: + task.cancel() + + # Wait for cancellations to be processed + await asyncio.sleep(1.0) + + # Check that metrics have reset to zero + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests_after, waiting_requests_after, kv_cache_usage_after = ( + _get_running_metrics_from_api(server)) + + assert running_requests_after == 0,\ + (f"Expected 0 running requests after abort, got " + f"{running_requests_after}") + assert waiting_requests_after == 0,\ + (f"Expected 0 waiting requests after abort, got " + 
f"{waiting_requests_after}") + assert kv_cache_usage_after == 0,\ + (f"Expected 0% KV cache usage after abort, got " + f"{kv_cache_usage_after}") + + +def _get_running_metrics_from_api(server: RemoteOpenAIServer): + """Return (running_count, waiting_count, kv_cache_usage)""" + + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests, waiting_requests, kv_cache_usage = None, None, None + + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:num_requests_running": + for sample in family.samples: + if sample.name == "vllm:num_requests_running": + running_requests = sample.value + break + elif family.name == "vllm:num_requests_waiting": + for sample in family.samples: + if sample.name == "vllm:num_requests_waiting": + waiting_requests = sample.value + break + elif family.name == "vllm:gpu_cache_usage_perc": + for sample in family.samples: + if sample.name == "vllm:gpu_cache_usage_perc": + kv_cache_usage = sample.value + break + + assert running_requests is not None + assert waiting_requests is not None + assert kv_cache_usage is not None + + return running_requests, waiting_requests, kv_cache_usage + + def test_metrics_exist_run_batch(use_v1: bool): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 839297135f..fdd96c3e95 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -298,7 +298,12 @@ class BlockPool: Returns: The KV cache usage (between 0.0 and 1.0). """ - return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks) + + # Subtract 1 to account for null block. + total_gpu_blocks = self.num_gpu_blocks - 1 + if not total_gpu_blocks: + return 0 + return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks) def take_events(self) -> list[KVCacheEvent]: """Atomically takes all events and clears the queue. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f9a7e21014..4b167da5c8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -902,10 +902,13 @@ class Scheduler(SchedulerInterface): finished_requests=finished_set) finished_req_ids.clear() - if engine_core_outputs: + if (stats := self.make_stats(spec_decoding_stats)) is not None: # Return stats to only one of the front-ends. - next(iter(engine_core_outputs.values())).scheduler_stats = ( - self.make_stats(spec_decoding_stats)) + if (eco := next(iter(engine_core_outputs.values()), None)) is None: + # We must return the stats even if there are no request + # outputs this step. 
+ engine_core_outputs[0] = eco = EngineCoreOutputs() + eco.scheduler_stats = stats return engine_core_outputs From d983769c41db224e0897fac2e9aefc5f57ad1122 Mon Sep 17 00:00:00 2001 From: who who who Date: Wed, 20 Aug 2025 14:24:37 +0800 Subject: [PATCH 421/932] fix cuda graph (#22721) Signed-off-by: fsx950223 --- vllm/v1/attention/backends/rocm_aiter_fa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 7d09ac0a4a..36b5853bfd 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch @@ -11,7 +11,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -231,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + cudagraph_support = AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From 103f1ec8d348a5f336f11d972d6285c4fb4736d4 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Wed, 20 Aug 2025 18:16:27 +0800 Subject: [PATCH 422/932] [Model] use autoWeightsLoader for gptoss (#22446) Signed-off-by: calvin chen --- vllm/model_executor/models/gpt_oss.py | 724 +++++++++++++------------- 1 file changed, 370 insertions(+), 354 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2f5d9ddd90..cd93f0ef1e 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .utils import extract_layer_index, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, + maybe_prefix) class OAIAttention(nn.Module): @@ -203,6 +204,7 @@ class GptOssModel(nn.Module): super().__init__() self.config = vllm_config.model_config.hf_config self.quant_config = vllm_config.quant_config + self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size self.embedding = VocabParallelEmbedding( self.config.vocab_size, @@ -225,8 +227,364 @@ class GptOssModel(nn.Module): x = self.norm(x) return x + def _load_weights_mxfp4( + self, + ep_rank_end: int, + ep_rank_start: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + mxfp4_block = 32 + use_ep = self.parallel_config.enable_expert_parallel + num_experts = self.config.num_local_experts + + tp_rank = get_tensor_model_parallel_rank() + tp_size = 
get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + # FIXME(woosuk): Remove this after testing. + weight = weight.cuda() + + if ".w13_weight_scale" in name: + # Handle MLP gate and up projection weights scale + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight_scale" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_weight" in name: + # Handle MLP gate and up projection weights + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def _load_weights_other( + self, + ep_rank_start: int, + ep_rank_end: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + use_ep = self.parallel_config.enable_expert_parallel + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + if ".w13_weight" in name: + # Handle MLP gate and up projection weights + # Extract gate and up projection parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[name] + param.copy_(weight) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv", ".q_proj", "q"), + (".qkv", ".k_proj", "k"), + (".qkv", ".v_proj", "v"), + ] + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + # Attention heads per rank + heads_per_rank = self.config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + quant_method = (self.config.quantization_config['quant_method'] if + hasattr(self.config, "quantization_config") else None) + if quant_method == "mxfp4": + return self._load_weights_mxfp4(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + else: + return self._load_weights_other(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + class GptOssForCausalLM(nn.Module): + packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".self_attn.": ".attn.", + ".post_attention_layernorm.": ".mlp.norm.", + }, + orig_to_new_suffix={ + ".embed_tokens.weight": ".embedding.weight", + ".input_layernorm.weight": ".attn.norm.weight", + ".post_attention_layernorm.weight": ".mlp.norm.weight", + + # MoE MXFP4 weights + ".gate_up_proj_blocks": ".w13_weight", + ".down_proj_blocks": ".w2_weight", + ".gate_up_proj_scales": ".w13_weight_scale", + ".down_proj_scales": ".w2_weight_scale", + + # MoE other weights + ".gate_up_proj": ".w13_weight", + ".down_proj": ".w2_weight", + + # MoE Bias + ".gate_up_proj_bias": ".w13_bias", + ".down_proj_bias": ".w2_bias", + }, + ) def __init__( self, @@ -235,16 +593,17 @@ class GptOssForCausalLM(nn.Module): ): super().__init__() self.vllm_config = vllm_config - self.model_config = vllm_config.model_config.hf_config + self.config = 
vllm_config.model_config.hf_config + self.model = GptOssModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"), ) self.lm_head = ParallelLMHead( - self.model_config.vocab_size, - self.model_config.hidden_size, + self.config.vocab_size, + self.config.hidden_size, ) - self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + self.logits_processor = LogitsProcessor(self.config.vocab_size) def forward(self, input_ids: torch.Tensor, @@ -261,354 +620,11 @@ class GptOssForCausalLM(nn.Module): sampling_metadata) return logits - def _load_weights_mxfp4( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - mxfp4_block = 32 - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - intermediate_size_block = intermediate_size // mxfp4_block - per_rank_intermediate_size_block = cdiv(intermediate_size_block, - tp_size) - per_rank_intermediate_size = (per_rank_intermediate_size_block * - mxfp4_block) - - # Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - # FIXME(woosuk): Remove this after testing. - weight = weight.cuda() - - if "gate_up_proj_blocks" in name: - # Handle MLP gate and up projection weights - new_name = name.replace("gate_up_proj_blocks", "w13_weight") - - # flat weight from (E, 2 * N, block_size, entry_per_block) - # to (E, 2 * N, -1), shouldn't trigger copy for contiguous - weight = weight.view(num_experts, 2 * intermediate_size, - -1).contiguous() - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_blocks" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_blocks", "w2_weight") - # same flatten here, but since 2 mx4 value are packed in 1 - # uint8, divide by 2 - weight = weight.view(num_experts, -1, - intermediate_size // 2).contiguous() - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - narrow_weight = weight[..., - tp_rank_start // 2:tp_rank_end // 2] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "gate_up_proj_scales" in name: - # Handle MLP gate and up projection weights scale - new_name = name.replace("gate_up_proj_scales", - "w13_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_scales" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_scales", "w2_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[..., tp_rank_start // - mxfp4_block:tp_rank_end // - mxfp4_block] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - weight_loader(param, - weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - loaded_params.add(renamed_name) - - return loaded_params - - def _load_weights_other( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - - per_rank_intermediate_size = cdiv(intermediate_size, tp_size) - # Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - if ".experts.gate_up_proj" in name and "bias" not in name: - # Handle MLP gate and up projection weights - new_name = name.replace(".experts.gate_up_proj", - ".experts.w13_weight") - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, :, - 2 * tp_rank_start:2 * tp_rank_end] - - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif ".experts.down_proj" in name and "bias" not in name: - # Handle MLP down projection weights - new_name = name.replace(".experts.down_proj", - ".experts.w2_weight") - - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] - else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - param = params_dict[new_name] - param.copy_(weight) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - loaded_params.add(renamed_name) - - return loaded_params - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - quant_method = (self.model_config.quantization_config['quant_method'] - if hasattr(self.model_config, "quantization_config") - else None) - if quant_method == "mxfp4": - return self._load_weights_mxfp4(weights) - else: - return self._load_weights_other(weights) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 3aa8c100381a1c6a99a259d9da5dac70fd3a6c0b Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 20 Aug 2025 18:46:59 +0800 Subject: [PATCH 423/932] Fix missing quotes (#23242) Signed-off-by: Shiming Zhang --- docs/deployment/frameworks/dstack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 23dc58c974..fe4d87f78f 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -9,7 +9,7 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```bash -pip install "dstack[all] +pip install dstack[all] dstack server ``` From 83e69a09d6c1a5e88ae00060e79ec7b7a9465462 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Wed, 20 Aug 2025 04:01:31 -0700 Subject: [PATCH 424/932] [Model] Support deepseek with eagle (#21086) Signed-off-by: Xin 
Yang --- tests/models/registry.py | 3 + tests/v1/e2e/test_spec_decode.py | 6 +- vllm/model_executor/models/deepseek_eagle.py | 246 +++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_eagle.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 28fe906316..739d962279 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -530,6 +530,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), + "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random", + speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501 + trust_remote_code=True), "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B", trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7b3f458312..bd0fa6b807 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -144,6 +144,8 @@ def test_ngram_correctness( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), True, marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), ], ids=[ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 @@ -151,7 +153,8 @@ def test_ngram_correctness( "llama3_eagle", "llama3_eagle3", "llama4_eagle", - "llama4_eagle_mm" + "llama4_eagle_mm", + "deepseek_eagle" ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -177,6 +180,7 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py new file mode 100644 index 0000000000..0c9c83cf61 --- /dev/null +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, + DeepseekV3ForCausalLM) +from vllm.model_executor.sampling_metadata import SamplingMetadata + +from .utils import AutoWeightsLoader, maybe_prefix + + +@support_torch_compile +class DeepseekV2Model(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer_id: int = 0, + ) -> None: + super().__init__() + 
self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = self.config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer( + self.config, + prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ) for i in range(self.config.num_hidden_layers) + ]) + + self.fc = nn.Linear( + self.config.model.hidden_size * 2, + self.config.model.hidden_size, + bias=False, + ) + + self.enorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.hnorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + input_embeds = self.embed_tokens(input_ids) + + inputs = torch.cat( + [self.enorm(input_embeds), + self.hnorm(hidden_states)], dim=-1) + hidden_states = self.fc(inputs) + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name_mapped = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + # if go with fusion option, then update name + if ((param_name == "fused_qkv_a_proj") + and name_mapped not in params_dict): + continue + else: + name = name_mapped + + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # if PP disabled then draft will share embed with target + if get_pp_group().world_size == 1 and \ + "embed_tokens." in name: + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + quant_config = vllm_config.quant_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix="model", + start_layer_id=target_layer_num) + + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config) + + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.config.vocab_size, + scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) + return self.model(input_ids, positions, hidden_states) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + ) + + model_weights = {} + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + model_weights[name] = loaded_weight + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8728684d8e..a94231b0f8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -264,6 +264,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), From 68fcd3fa7313d00240f766f42affe931f1f379a7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 19:09:18 +0800 Subject: [PATCH 425/932] [Bugfix] Ensure correctness of Cohere2Vision processing (#23245) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 1 + vllm/model_executor/models/aya_vision.py | 3 +- vllm/model_executor/models/cohere2_vision.py | 71 ++++++++++++++----- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8aa0dc7e8e..d5b1de834a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -268,6 +268,7 @@ def _test_processing_correctness_one( "CohereForAI/aya-vision-8b", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", + "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "microsoft/Florence-2-base", "adept/fuyu-8b", diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b02a973d94..687c82ded9 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -250,8 +250,7 @@ class AyaVisionMultiModalProcessor( image_processor = hf_processor.image_processor def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) num_patches = self.info.get_num_patches( image_width=image_size.width, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index bc526fd661..4682a8a428 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -10,6 +10,8 @@ import torch from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import ( # noqa: E501 + get_optimal_tiled_canvas) from transformers.models.cohere2_vision.processing_cohere2_vision import ( Cohere2VisionProcessor) @@ -150,14 +152,46 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): max_patches = image_processor.max_patches return ImageSize(height=height * max_patches, width=width) - def get_num_patches(self, image_width: int, image_height: int) -> int: + def get_num_patches( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Cohere2VisionProcessor], + ) -> int: """ Calculate the number of image patches for a given image. 
Uses the HF processor to determine the actual number of patches. """ - return self.get_hf_processor( - ).image_processor.get_number_of_image_patches(image_height, - image_width, {}) + if processor is None: + processor = self.get_hf_processor() + + image_processor = processor.image_processor + + # The current implementation of get_number_of_image_patches + # is incorrect, so we patch it here. + # return image_processor.get_number_of_image_patches(image_height, + # image_width, {}) + + min_patches = image_processor.min_patches + max_patches = image_processor.max_patches + patch_size = image_processor.size + crop_to_patches = image_processor.crop_to_patches + + if not crop_to_patches: + return 1 + + num_columns, num_rows = get_optimal_tiled_canvas( + (image_height, image_width), + (patch_size["height"], patch_size["width"]), + min_patches, + max_patches, + ) + num_patches = num_columns * num_rows + if num_patches > 1: + num_patches += 1 # Thumbnail image + + return num_patches class Cohere2VisionDummyInputsBuilder( @@ -208,6 +242,8 @@ class Cohere2VisionMultiModalProcessor( # Ensure num_patches is available for proper tensor splitting if "num_patches" not in processed_outputs and ( images := mm_data.get("images")) is not None: + hf_processor = self.info.get_hf_processor(**mm_kwargs) + # Fallback calculation if HF processor didn't provide num_patches parsed_images = self._get_data_parser().parse_mm_data({ "image": @@ -217,8 +253,9 @@ class Cohere2VisionMultiModalProcessor( num_patches = [ self.info.get_num_patches( image_width=parsed_images.get_image_size(i).width, - image_height=parsed_images.get_image_size(i).height) - for i in range(len(parsed_images)) + image_height=parsed_images.get_image_size(i).height, + processor=hf_processor, + ) for i in range(len(parsed_images)) ] processed_outputs["num_patches"] = torch.tensor(num_patches) @@ -245,25 +282,25 @@ class Cohere2VisionMultiModalProcessor( ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token + img_tokens_per_tile = int(hf_processor.patch_size**2) img_line_break_token = hf_processor.img_line_break_token boi_token = hf_processor.boi_token eoi_token = hf_processor.eoi_token def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) - num_patches = self.info.get_num_patches(image_size.height, - image_size.width) - img_tokens_per_tile = int(hf_processor.patch_size**2) - single_tile_tokens = image_token * img_tokens_per_tile + \ - img_line_break_token - img_string = f"{boi_token}\ - {single_tile_tokens * num_patches}\ - {eoi_token}" + num_patches = self.info.get_num_patches( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + patch_tokens = (image_token * img_tokens_per_tile + + img_line_break_token) + repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}" - return PromptUpdateDetails.select_text(img_string, image_token) + return PromptUpdateDetails.select_text(repl, image_token) return [ PromptReplacement( From 50df09fe13c93b520c64c581de4f0b469995f7b9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 08:05:54 -0400 Subject: [PATCH 426/932] Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 2 +- docker/Dockerfile | 52 
++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 85d3e56387..e20ce54ca7 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -68,7 +68,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Annotate release workflow" diff --git a/docker/Dockerfile b/docker/Dockerfile index 7493891778..cfaa598682 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -372,31 +372,45 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.11" +# Keep this in sync with "flashinfer" extra in setup.py +ARG FLASHINFER_GIT_REF="v0.2.12" +# Flag to control whether to compile FlashInfer AOT kernels +# Set to "true" to enable AOT compilation: +# docker build --build-arg FLASHINFER_AOT_COMPILE=true ... +ARG FLASHINFER_AOT_COMPILE=false RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ ${FLASHINFER_GIT_REPO} flashinfer - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. - if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" - fi - echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" - # Needed to build AOT kernels pushd flashinfer - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation --force-reinstall --no-deps . + if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. 
+ if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi + echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + # Build AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot + # Install with no-build-isolation since we already built AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + # Download pre-compiled cubins + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + else + echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" + uv pip install --system . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + fi popd rm -rf flashinfer BASH diff --git a/setup.py b/setup.py index cc3037ebb7..6a3013de79 100644 --- a/setup.py +++ b/setup.py @@ -685,7 +685,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.11"], + "flashinfer": ["flashinfer-python==0.2.12"], }, cmdclass=cmdclass, package_data=package_data, From 7cd17e22d76473919c55aa75ac1897e4d3fbe277 Mon Sep 17 00:00:00 2001 From: xyxinyang <43821961+xyxinyang@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:41:55 +0800 Subject: [PATCH 427/932] [Model][V1] Support Ernie MTP (#22169) Signed-off-by: zhouchong Co-authored-by: zhouchong --- tests/models/registry.py | 3 + vllm/config/__init__.py | 31 ++- vllm/model_executor/models/ernie_mtp.py | 287 ++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 2 +- vllm/worker/worker.py | 3 +- 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/ernie_mtp.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 739d962279..6e6acfb8cd 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -556,6 +556,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", + trust_remote_code=True, + speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"), "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5", speculative_model="zai-org/GLM-4.5", min_transformers_version="4.54", diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 56a749789b..801fa97fe5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1463,7 +1463,8 @@ class ModelConfig: from vllm.distributed.utils import get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp"): + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ -1911,7 +1912,8 @@ class DeviceConfig: SpeculativeMethod = Literal["ngram", "eagle", 
"eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp"] + "mlp_speculator", "draft_model", "deepseek_mtp", + "ernie_mtp"] @config @@ -2044,6 +2046,16 @@ class SpeculativeConfig: "architectures": ["Glm4MoeMTPModel"] }) + if hf_config.model_type == "ernie4_5_moe": + hf_config.model_type = "ernie_mtp" + if hf_config.model_type == "ernie_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["ErnieMTPModel"] + }) + return hf_config + return hf_config def __post_init__(self): @@ -2062,8 +2074,8 @@ class SpeculativeConfig: if self.target_model_config and \ (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type \ - == "mimo"): + self.target_model_config.hf_text_config.model_type in + ("mimo","ernie4_5_moe")): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): @@ -2161,6 +2173,15 @@ class SpeculativeConfig: "one layer. Might need some code changes " \ "to support multiple layers." ) + elif (self.draft_model_config.hf_config.model_type == + "ernie_mtp"): + self.method = "ernie_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Ernie MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." + ) else: self.method = "draft_model" raise NotImplementedError( @@ -2376,7 +2397,7 @@ class SpeculativeConfig: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp") + return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py new file mode 100644 index 0000000000..90a1267b28 --- /dev/null +++ b/vllm/model_executor/models/ernie_mtp.py @@ -0,0 +1,287 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ernie-MTP model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .llama import LlamaDecoderLayer +from .utils import is_pp_missing_parameter, maybe_prefix + + +class ErnieMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.mtp_emb_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_hidden_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_linear_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.mtp_block = LlamaDecoderLayer(config, cache_config, quant_config, + prefix) + + def forward( + self, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + + inputs_embeds = self.mtp_emb_norm(inputs_embeds) + previous_hidden_states = self.mtp_hidden_norm(previous_hidden_states) + + hidden_states = self.mtp_linear_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + + return hidden_states + + +class ErnieMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + ErnieMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]( + inputs_embeds, + positions, + previous_hidden_states, + spec_step_idx, + ) + + def 
compute_logits( + self, + hidden_states: torch.Tensor, + lm_head: ParallelLMHead, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + self.layers[str(self.mtp_start_layer_idx + spec_step_idx)] + logits = self.logits_processor(lm_head, hidden_states, + sampling_metadata) + return logits + + +class ErnieMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.config = vllm_config.model_config.hf_config + self.model = ErnieMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size) + self.sampler = get_sampler() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + assert spec_step_idx == 0, "ernie_mtp only support predict one token" + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, self.lm_head, + sampling_metadata, spec_step_idx) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + continue + if "rotary_emb.inv_freq" in name: + continue + if "mtp" in name: + name = self._rewrite_spec_layer_name(self.config, name) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + if "mtp" not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if "mtp_" not in name and ("embed_tokens" not in name + and "lm_head" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, config: PretrainedConfig, + name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. + """ + spec_layer_weight_names = [ + "embed_tokens", "mtp_emb_norm", "mtp_hidden_norm", + "mtp_linear_proj" + ] + layer_idx = config.num_hidden_layers + for weight_name in spec_layer_weight_names: + if weight_name in name: + name = name.replace( + f"model.{weight_name}.0.", + f"model.layers.{layer_idx}.{weight_name}.") + return name + name = name.replace("model.mtp_block.0.", + f"model.layers.{layer_idx}.mtp_block.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a94231b0f8..78ef270598 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -266,6 +266,7 @@ _SPECULATIVE_DECODING_MODELS = { # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), # Temporarily disabled. diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index a8a160a0f9..8cd2ad12cf 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -194,7 +194,7 @@ class EagleProposer: hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) - if self.method == "deepseek_mtp": + if self.method in ("deepseek_mtp", "ernie_mtp"): last_hidden_states = ret_hidden_states else: last_hidden_states, hidden_states = ret_hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9dfea94756..7a01e585ba 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,8 @@ class Worker(LocalOrDistributedWorkerBase): "eagle", "deepseek_mtp", "glm4_moe_mtp", - "mimo_mtp")) \ + "mimo_mtp", + "ernie_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From c6d80a7a9620637ba5016dd3c0d6061e79eed73c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 20 Aug 2025 20:47:05 +0800 Subject: [PATCH 428/932] [Model] Improve olmo and olmo2 (#23228) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 4 ++-- vllm/model_executor/models/olmo.py | 22 +++++++++++++++++++--- vllm/model_executor/models/olmo2.py | 17 +++++++++++++++-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7908e42387..7308d00106 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -384,8 +384,8 @@ th { | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. 
| | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 1dc4df85c1..01639d3981 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -47,7 +47,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -91,6 +91,7 @@ class OlmoAttention(nn.Module): self.total_num_heads, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) # Rotary embeddings. @@ -114,6 +115,7 @@ class OlmoAttention(nn.Module): self.hidden_size, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) def forward( @@ -142,6 +144,7 @@ class OlmoMLP(nn.Module): self, config: OlmoConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -154,6 +157,7 @@ class OlmoMLP(nn.Module): [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) # Activation function. @@ -165,6 +169,7 @@ class OlmoMLP(nn.Module): self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -197,7 +202,7 @@ class OlmoDecoderLayer(nn.Module): prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = OlmoMLP(config, quant_config) + self.mlp = OlmoMLP(config, quant_config, prefix=f"{prefix}.mlp") # LayerNorm self.input_layernorm = nn.LayerNorm(config.hidden_size, @@ -326,10 +331,21 @@ class OlmoModel(nn.Module): return loaded_params -class OlmoForCausalLM(nn.Module, SupportsPP): +class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 499e6d30ed..66a0f91155 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,6 +33,7 @@ from torch import nn from transformers import Olmo2Config from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather @@ -48,7 +49,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -253,6 +254,7 @@ class Olmo2DecoderLayer(nn.Module): return hidden_states +@support_torch_compile class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -354,10 +356,21 @@ class Olmo2Model(nn.Module): return loaded_params -class Olmo2ForCausalLM(nn.Module, SupportsPP): +class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 38217877aa70041c0115ee367b75197af9cbc5ad Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 20 Aug 2025 21:34:49 +0800 Subject: [PATCH 429/932] [Fix] fix offline env use local mode path (#22526) Signed-off-by: rongfu.leng --- .../offline_mode/test_offline_mode.py | 35 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++- vllm/transformers_utils/config.py | 23 ++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a606eeab58..dd8d63ad31 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" +import dataclasses import importlib import sys @@ -9,6 +10,7 @@ import urllib3 from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.engine.arg_utils import EngineArgs MODEL_CONFIGS = [ { @@ -108,3 +110,36 @@ def _re_import_modules(): # Error this test if reloading a module failed if reload_exception is not None: raise reload_exception + + +@pytest.mark.skip_global_cleanup +@pytest.mark.usefixtures("cache_models") +def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): + # Set HF to offline mode and ensure we can still construct an LLM + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") + + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") + + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + engine_args = EngineArgs(model="facebook/opt-125m") + LLM(**dataclasses.asdict(engine_args)) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 679905aed9..48d9cd08af 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import huggingface_hub import regex as re import torch from pydantic import TypeAdapter, ValidationError @@ -39,7 +40,7 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -457,6 +458,13 @@ class EngineArgs: # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() + # when use hf offline,replace model id to local model path + if 
huggingface_hub.constants.HF_HUB_OFFLINE: + model_id = self.model + self.model = get_model_path(self.model, self.revision) + logger.info( + "HF_HUB_OFFLINE is True, replace model_id [%s] " \ + "to model_path [%s]",model_id, self.model) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d8c964fb2a..fe345bd8f0 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -14,7 +14,7 @@ from huggingface_hub import get_safetensors_metadata, hf_hub_download from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - HFValidationError, LocalEntryNotFoundError, + LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig @@ -335,6 +335,7 @@ def maybe_override_with_speculators_target_model( gguf_model_repo = Path(model).parent else: gguf_model_repo = None + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model if gguf_model_repo is None else gguf_model_repo, revision=revision, @@ -400,6 +401,7 @@ def get_config( raise ValueError(error_message) from e if config_format == ConfigFormat.HF: + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model, revision=revision, @@ -532,7 +534,7 @@ def try_get_local_file(model: Union[str, Path], revision=revision) if isinstance(cached_filepath, str): return Path(cached_filepath) - except HFValidationError: + except ValueError: ... 
return None @@ -908,3 +910,20 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: exc_info=e) return max_position_embeddings + + +def get_model_path(model: Union[str, Path], revision: Optional[str] = None): + if os.path.exists(model): + return model + assert huggingface_hub.constants.HF_HUB_OFFLINE + common_kwargs = { + "local_files_only": huggingface_hub.constants.HF_HUB_OFFLINE, + "revision": revision, + } + + if envs.VLLM_USE_MODELSCOPE: + from modelscope.hub.snapshot_download import snapshot_download + return snapshot_download(model_id=model, **common_kwargs) + + from huggingface_hub import snapshot_download + return snapshot_download(repo_id=model, **common_kwargs) From 44492358439f612b3934ccd902dbd90fcfa19866 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 22:19:30 +0800 Subject: [PATCH 430/932] [Bugfix] Ensure correctness of HCXVision processing (#23254) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 2 +- .../models/hyperclovax_vision.py | 116 ++++++++---------- 2 files changed, 55 insertions(+), 63 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d5b1de834a..02aecfad82 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -102,7 +102,7 @@ def _test_processing_correctness( partial(random_video, rng, min_frames=2, - max_frames=8, + max_frames=16, min_wh=128, max_wh=256), "audio": diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index d3ddc47ea9..f8b30d8d98 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -53,6 +53,21 @@ IMAGE_TOKEN: str = "<|dummy3|>" VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" +# Based on combine_frames_into_images in +# https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B/blob/main/processing_hyperclovax.py +def get_num_combined_frames( + num_frames: int, + max_grid_shape: tuple[int, int] = (3, 3), +) -> int: + max_num_grids = max_grid_shape[0] * max_grid_shape[1] + + # Calculate the number of canvases needed. 
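+    # Each canvas packs up to max_grid_shape[0] * max_grid_shape[1] frames;
+    # any leftover frames still occupy one extra, partially filled canvas.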
+ num_canvases = num_frames // max_num_grids + leftover_frames = num_frames % max_num_grids + + return num_canvases + (leftover_frames > 0) + + class HCXVisionMultimodalPixelInputs(TypedDict): type: Literal["pixel_values"] pixel_values_images: list[torch.Tensor] @@ -172,23 +187,20 @@ class HCXVisionMultiModalProcessor( def replace_multimodal_token( token_ids: torch.Tensor, target_token: int, - repeats: list, + repeats: list[int], ): - output = list() + output = list[int]() _repeats_idx = 0 for token_id in token_ids: if token_id == target_token: - output += [ - token_id.item(), - ] * repeats[_repeats_idx] + output += [token_id.item()] * repeats[_repeats_idx] _repeats_idx += 1 else: - output += [ - token_id.item(), - ] + output += [token_id.item()] + return torch.tensor(output, device=token_ids.device) - for video_idx, video_arr in enumerate(mm_data.get("videos", list())): + for video_idx, video_arr in enumerate(mm_data.get("videos", [])): if video_arr.dtype == np.uint8: continue mm_data["videos"][video_idx] = video_arr.astype(np.uint8) @@ -205,88 +217,68 @@ class HCXVisionMultiModalProcessor( if len(mm_data) > 0: # batchify input as a single item images = mm_data.get("images", None) - num_images = 0 - if images is not None: - num_images = len(images) - images = [ - images, - ] # batchify + batched_images = None if images is None else [images] - videos = mm_data.get("videos", - None) # list of video in single conversation - num_videos = 0 - if videos is not None: - num_videos = len(videos) - videos = [ - videos, - ] # batchify + # list of video in single conversation + videos = mm_data.get("videos", None) + batched_videos = None if videos is None else [videos] _processed_outputs = self.info.ctx.call_hf_processor( hf_processor=self.info.get_hf_processor(**mm_kwargs), data=dict( text=None, - images=images, - videos=videos, + images=batched_images, + videos=batched_videos, ), ) # mm-only for k, v in _processed_outputs.items(): - if len(v) < 1: - continue - elif k.endswith("_images"): - # list of list of 4D tensor -> list of 4D tensor + if isinstance(v, list) and len(v) > 0: + assert len(v) == 1 _processed_outputs[k] = v[0] - elif k.endswith("_videos"): - # list of list of 4D tensor -> list of 4D tensor - v = v[0] - if k == "pixel_values_videos": - v = torch.cat(v, dim=0) - _c, _w, _h = v.shape[-3:] - v = v.reshape(num_videos, -1, _c, _w, _h) - v = list(torch.unbind(v, dim=0)) - _processed_outputs[k] = v - if num_images > 0: + if images: tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) processed_outputs["input_ids"] = torch.stack([ replace_multimodal_token( token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - IMAGE_TOKEN), + target_token=image_token_id, repeats=_processed_outputs[ "vision_query_lengths_images"], ) for _input_ids in processed_outputs["input_ids"] ], dim=0) - if num_videos > 0: - tokenizer = self.info.get_tokenizer() - processed_outputs["input_ids"] = torch.stack([ - replace_multimodal_token( - token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - VIDEO_TOKEN), - repeats=_processed_outputs[ - "vision_query_lengths_videos"], - ) for _input_ids in processed_outputs["input_ids"] - ], - dim=0) - - _ratios = [ - len(_pixel_values) for _pixel_values in - _processed_outputs["pixel_values_videos"] - ] + if videos: _num_per_videos = [ - int(_e / sum(_ratios) * - len(_processed_outputs["vision_query_lengths_videos"])) - for _e in _ratios + get_num_combined_frames(len(video)) for video in videos + ] + 
_processed_outputs["pixel_values_videos"] = [ + _processed_outputs["pixel_values_videos"] + [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(len(videos)) ] _processed_outputs["vision_query_lengths_videos"] = [ _processed_outputs["vision_query_lengths_videos"] [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] - for _i in range(0, num_videos) + for _i in range(len(videos)) ] + tokenizer = self.info.get_tokenizer() + video_token_id = tokenizer.convert_tokens_to_ids(VIDEO_TOKEN) + processed_outputs["input_ids"] = torch.stack([ + replace_multimodal_token( + token_ids=_input_ids, + target_token=video_token_id, + repeats=[ + sum(lens) for lens in + _processed_outputs["vision_query_lengths_videos"] + ], + ) for _input_ids in processed_outputs["input_ids"] + ], + dim=0) + processed_outputs.update(_processed_outputs) return processed_outputs From b17109beeafbf9577c319ab61530810943a7fc4b Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 07:35:26 -0700 Subject: [PATCH 431/932] [Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045) Signed-off-by: Shixian Cui --- .../kernels/benchmark_grouped_gemm_cutlass.py | 35 +++- csrc/moe/moe_permute_unpermute_op.cu | 33 ++-- csrc/ops.h | 5 + .../cutlass_w8a8/moe/get_group_starts.cuh | 6 +- .../quantization/cutlass_w8a8/moe/moe_data.cu | 65 +++++-- .../cutlass_w8a8/scaled_mm_entry.cu | 24 +++ csrc/torch_bindings.cpp | 13 ++ tests/kernels/moe/test_cutlass_moe.py | 18 +- .../kernels/moe/test_moe_permute_unpermute.py | 6 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 22 ++- .../quantization/test_cutlass_scaled_mm.py | 2 +- vllm/_custom_ops.py | 22 +++ .../layers/fused_moe/cutlass_moe.py | 179 +++++++++++------- .../layers/fused_moe/moe_permute_unpermute.py | 29 ++- .../compressed_tensors_moe.py | 31 +++ 15 files changed, 369 insertions(+), 121 deletions(-) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 1d4e730f99..a6b42406b5 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -80,6 +80,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -111,6 +116,10 @@ def bench_run( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, @@ -125,6 +134,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -136,6 +149,10 @@ def bench_run( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): @@ -150,6 +167,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -194,6 +215,10 @@ def bench_run( 
w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -231,6 +256,10 @@ def bench_run( "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -289,6 +318,10 @@ def bench_run( w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, per_act_token, @@ -297,7 +330,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 2922352a3f..ca0c873f49 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -45,8 +45,6 @@ void moe_permute( auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); auto sorted_row_idx = torch::empty_like(inv_permuted_idx); - auto align_expert_first_token_offset = - torch::zeros_like(expert_first_token_offset); CubKeyValueSorter sorter{}; int64_t* valid_num_ptr = nullptr; @@ -85,12 +83,14 @@ void moe_permute( }); // get m_indices and update expert_first_token_offset with align block - getMIndices(get_ptr(expert_first_token_offset), - get_ptr(align_expert_first_token_offset), - get_ptr(m_indices), n_local_expert, align_block_size_value, - stream); + // this is only required for DeepGemm and not required for CUTLASS group gemm if (align_block_size.has_value()) { - // update align_expert_first_token_offset + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); expert_first_token_offset.copy_(align_expert_first_token_offset); } } @@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& expert_first_token_offset, torch::Tensor& src_row_id2dst_row_id_map, torch::Tensor& m_indices) { - TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); } -void moe_unpermute(const torch::Tensor& input, - const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indices, - const std::optional& expert_map, - int64_t n_expert, int64_t n_local_expert, int64_t topk, - const std::optional& align_block_size, - torch::Tensor& permuted_input, - torch::Tensor& expert_first_token_offset, - torch::Tensor& src_row_id2dst_row_id_map, - torch::Tensor& m_indices) { +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, + const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx, + const std::optional& expert_first_token_offset, int64_t topk, + torch::Tensor& hidden_states) { TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); } @@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() { 
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} +} \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 64bcec6ca1..86fe848e2f 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -229,6 +229,11 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh index 6c6e897908..15bb2c3005 100644 --- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -10,7 +10,7 @@ template __global__ void get_group_gemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, ElementC** out_offsets, ElementAccumulator** a_scales_offsets, ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, ElementAB* b_base_as_int, ElementC* out_base_as_int, @@ -34,7 +34,7 @@ __global__ void get_group_gemm_starts( else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ get_group_gemm_starts \ <<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ + static_cast(expert_offsets.data_ptr()), \ static_cast(a_ptrs.data_ptr()), \ static_cast(b_ptrs.data_ptr()), \ static_cast(out_ptrs.data_ptr()), \ @@ -61,6 +61,8 @@ void run_get_group_gemm_starts( TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); int num_experts = static_cast(expert_offsets.size(0)); bool per_act_token = a_scales.numel() != 1; diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 100f485084..49cafcc32a 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -104,6 +104,53 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids, } } +namespace { +inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + torch::Tensor& atomic_buffer, + int64_t num_experts, int64_t n, + int64_t k, cudaStream_t stream, + const bool swap_ab) { + int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel()); + + const int32_t* topk_ptr = static_cast(topk_ids.data_ptr()); + int32_t* ps1_ptr = static_cast(problem_sizes1.data_ptr()); + int32_t* ps2_ptr = static_cast(problem_sizes2.data_ptr()); + int32_t* atomic_ptr = static_cast(atomic_buffer.data_ptr()); + + if (swap_ab) { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } else { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } +} +} // namespace + +void 
get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + // Swap-AB should be disabled for FP4 path + bool may_swap_ab = (!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD); + + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); +} + void get_cutlass_moe_mm_data_caller( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -121,21 +168,9 @@ void get_cutlass_moe_mm_data_caller( bool may_swap_ab = (!blockscale_offsets.has_value()) && (topk_ids.numel() <= SWAP_AB_THRESHOLD); - if (may_swap_ab) { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } else { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); if (blockscale_offsets.has_value()) { // fp4 path diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 106bacb488..84843ee6e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -76,6 +76,11 @@ void get_cutlass_moe_mm_data_caller( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -293,6 +298,25 @@ void get_cutlass_moe_mm_data( version_num, ". Required capability: 90 or 100"); } +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + int32_t version_num = get_sm_version_num(); +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) + get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, + problem_sizes2, num_experts, n, k, + blockscale_offsets); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " + "kernel for CUDA device capability: ", + version_num, ". 
Required capability: 90 or 100"); +} + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7079671c2e..3a0ff6eaa7 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -440,6 +440,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // A function that computes problem sizes for each expert's multiplication + // used by the two mms called from fused MoE operation. It takes topk_ids as + // an input, and computes problem_sizes1 and problem_sizes2 only. + ops.def( + "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, " + " Tensor! problem_sizes1, " + " Tensor! problem_sizes2, " + " int num_experts, int n, int k, " + " Tensor? blockscale_offsets) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, + &get_cutlass_moe_mm_problem_sizes); + // A function that computes data required to run fused MoE with w8a8 grouped // GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs // as an input, and computes expert_offsets (token start indices of each diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 81fb3ec1de..c84f66383b 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -207,6 +207,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'topk_ids': topk_ids, 'w1_scale': moe_tensors.w1_scale, 'w2_scale': moe_tensors.w2_scale, + 'ab_strides1': moe_tensors.ab_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides1': moe_tensors.c_strides1, + 'c_strides2': moe_tensors.c_strides2, 'per_act_token': per_act_token, 'a1_scale': None #moe_tensors.a_scale } @@ -424,8 +428,8 @@ def test_run_cutlass_moe_fp8( topk_ids[0][1] = 1 workspace13_shape = (m * topk, max(2 * n, k)) - workspace2_shape = (m * topk, n) - output_shape = (m * topk, k) + workspace2_shape = (m * topk, max(n, k)) + output_shape = (m, k) workspace13 = torch.empty(prod(workspace13_shape), device="cuda", @@ -440,6 +444,11 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, torch.float8_e4m3fn, @@ -448,8 +457,9 @@ def test_run_cutlass_moe_fp8( func = lambda output: run_cutlass_moe_fp8( output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, workspace13, workspace2, None, mt.a.dtype, - per_act_token, per_out_channel, False) + a1q_scale, None, ab_strides1, ab_strides2, c_strides1, c_strides2, + workspace13, workspace2, None, mt.a.dtype, per_act_token, + per_out_channel, False, topk_weights) workspace13.random_() output_random_workspace = torch.empty(output_shape, diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 6ca01f9271..d71664d94b 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ 
b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -238,7 +238,11 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, atol=0, rtol=0) # check mindice - torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # current kernel usage assumes deepgemm requires align_block_size + # when it's not provided then we don't compute m_indices (for cutlass) + if align_block_size is not None: + torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # check permuted_hidden_states, only valid token torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx], permuted_hidden_states[valid_row_idx], diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index f98937ee6c..98908f2714 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -76,6 +76,7 @@ def pplx_cutlass_moe( assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape + intermediate_dim = w2.shape[2] num_experts = w1.shape[0] block_size = hidden_dim # TODO support more cases device = pgi.device @@ -124,8 +125,27 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) + ab_strides1 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_local_experts, ), + intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_local_experts, ), + 2 * intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides2 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, - out_dtype, per_act_token, per_out_ch) + out_dtype, per_act_token, per_out_ch, + ab_strides1, ab_strides2, c_strides1, + c_strides2) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 8730eeaaa7..a15decdf6f 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, expert_offsets = torch.zeros((num_experts + 1), device=device, - dtype=torch.int32) + dtype=torch.int64) problem_sizes = torch.zeros((num_experts, 3), device=device, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0d556053f8..39da08847b 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -844,6 +844,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor, blockscale_offsets) +def get_cutlass_moe_mm_problem_sizes( + topk_ids: torch.Tensor, + problem_sizes1: torch.Tensor, + problem_sizes2: torch.Tensor, + num_experts: int, + n: int, + k: int, + blockscale_offsets: Optional[torch.Tensor] = None): + """ + Compute only the per-expert problem sizes needed by the two grouped matrix + multiplications used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token→expert mapping) and computes: + - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's + multiplication for the two grouped MMs + used in the fused MoE operation. 
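+
+    Unlike get_cutlass_moe_mm_data, this variant does not fill expert_offsets
+    or the sorted token maps; only the two problem-size tensors are written.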
+ """ + return torch.ops._C.get_cutlass_moe_mm_problem_sizes( + topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, + blockscale_offsets) + + def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor): """ Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor. diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 0a02b558d0..95d23ec034 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -9,12 +9,13 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.scalar_type import scalar_types @@ -34,6 +35,10 @@ def run_cutlass_moe_fp8( w2_scale: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], @@ -41,6 +46,7 @@ def run_cutlass_moe_fp8( per_act_token: bool, per_out_ch: bool, use_batched_format: bool, + topk_weights: Optional[torch.Tensor], ): a1q = hidden_states @@ -99,6 +105,22 @@ def run_cutlass_moe_fp8( topk = local_topk_ids.size(1) local_E = w1.size(0) + if use_batched_format: + mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2)) + act_out = _resize_cache(workspace2, (local_E * padded_M, N)) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (local_E * padded_M, N)) + mm2_out = _resize_cache(workspace2, (local_E * padded_M, K)) + else: + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), + (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (M * topk, N)) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + if use_batched_format: assert expert_num_tokens is not None @@ -120,11 +142,10 @@ def run_cutlass_moe_fp8( w2_scale = w2_scale.reshape(w2_scale.size(0), -1) a1q = a1q.reshape(-1, a1q.size(2)) a1q_scale = a1q_scale.reshape(-1, a1q_scale.size(2)).contiguous() - + # c3x get_group_gemm_starts expects int64 to avoid overflow + # during offset calculations + expert_offsets = expert_offsets.to(torch.int64) else: - expert_offsets = torch.empty((global_num_experts + 1), - dtype=torch.int32, - device=device) problem_sizes1 = torch.empty((global_num_experts, 3), dtype=torch.int32, device=device) @@ -132,84 +153,57 @@ def run_cutlass_moe_fp8( dtype=torch.int32, device=device) - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. 
We fill it - # zeros for correctness. - if expert_map is not None: - a_map = torch.zeros((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - else: - a_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, - problem_sizes1, problem_sizes2, a_map, - c_map, global_num_experts, N, K) - - a1q = _fp8_perm(a1q, a_map) - a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + num_expert = global_num_experts if expert_map is None \ + else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm) expert_offsets = expert_offsets[:-1] - ab_strides1 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - c_strides1 = torch.full((w1.size(0), ), - 2 * N, - device=device, - dtype=torch.int64) - ab_strides2 = torch.full((w1.size(0), ), - N, - device=device, - dtype=torch.int64) - c_strides2 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - - if use_batched_format: - c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) - c2 = _resize_cache(workspace2, (local_E * padded_M, N)) - c3 = _resize_cache(workspace13, (local_E * padded_M, K)) - else: - c1 = _resize_cache(workspace13, (M * topk, N * 2)) - c2 = _resize_cache(workspace2, (M * topk, N)) - c3 = _resize_cache(workspace13, (M * topk, K)) + ops.get_cutlass_moe_mm_problem_sizes(local_topk_ids, problem_sizes1, + problem_sizes2, + global_num_experts, N, K) if not per_act_token and (expert_map is not None or use_batched_format): # this is necessary to avoid imprecise scale calculation caused by # random data in the unused workspace. The workspace is unused when # this rank handles only partial tokens, or when it is batched . - c1.fill_(0) + mm1_out.fill_(0) - ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets, + ops.cutlass_moe_mm(mm1_out, a1q, w1, a1q_scale, w1_scale, expert_offsets, problem_sizes1, ab_strides1, ab_strides1, c_strides1, per_act_token, per_out_ch) - activation_callable(c2, c1) + activation_callable(act_out, mm1_out) a2q, a2q_scale = ops.scaled_fp8_quant( - c2, a2_scale, use_per_token_if_dynamic=per_act_token) + act_out, + a2_scale, + use_per_token_if_dynamic=per_act_token, + output=quant_out) if expert_map is not None: - c3.fill_(0) + mm2_out.fill_(0) - ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets, + ops.cutlass_moe_mm(mm2_out, a2q, w2, a2q_scale, w2_scale, expert_offsets, problem_sizes2, ab_strides2, ab_strides2, c_strides2, per_act_token, per_out_ch) if use_batched_format: - output.copy_(c3.reshape(local_E, padded_M, K), non_blocking=True) + output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True) else: - # We can't do this inplace because output may point to the same tensor - # as c3. - output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. 
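+ # moe_unpermute restores the original token order and reduces over the + # top-k experts using topk_weights inside a single CUDA kernel, which is + # why finalize_weight_and_reduce_impl below returns TopKWeightAndReduceNoOP().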
+ moe_unpermute(out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm) class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): @@ -219,6 +213,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( @@ -229,6 +227,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): block_shape=block_shape, )) self.out_dtype = out_dtype + self.ab_strides1 = ab_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. @@ -272,10 +274,11 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, + a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, + self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, - use_batched_format) + use_batched_format, topk_weights) class CutlassExpertsFp8(CutlassExpertsFp8Base): @@ -285,12 +288,20 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) @@ -307,6 +318,10 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): def supports_expert_map(self) -> bool: return True + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + def workspace_shapes( self, a: torch.Tensor, @@ -320,8 +335,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -335,12 +350,20 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) assert max_experts_per_worker > 0 @@ -378,7 +401,8 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): assert num_dp is not None workspace1 = (self.max_experts_per_worker, padded_M * num_dp, max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + 
workspace2 = (self.max_experts_per_worker, padded_M * num_dp, + max(N // 2, K)) output = (self.max_experts_per_worker, padded_M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -392,6 +416,10 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, @@ -419,6 +447,17 @@ def cutlass_moe_fp8( Shape: [num_experts] or [num_experts, 2N] - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K] + - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. + Shape: [num_experts] + - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. + Shape: [num_experts] + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. Shape: scalar or [M] - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to @@ -450,6 +489,10 @@ def cutlass_moe_fp8( out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, ), ) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index d9059f50b4..16a155e718 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -82,7 +82,8 @@ def moe_permute( n_local_expert: int = -1, expert_map: Optional[torch.Tensor] = None, align_block_size: Optional[int] = None, - fill_invalid_expert: int = -1 + fill_invalid_expert: int = -1, + permuted_hidden_states: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -95,14 +96,17 @@ def moe_permute( - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices + - permuted_hidden_states (Optional[torch.Tensor]): Optional output tensor. + If None, the output tensor will be created in this function. Returns: - permuted_hidden_states (torch.Tensor): permuted activation. - - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states + - a1q_scale (Optional[torch.Tensor]): permuted quant scale for hidden_states + if original scale not per-tensor scaling - expert_first_token_offset (torch.Tensor): offset of the first token of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. 
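A minimal usage sketch (not part of the patch) of the new `permuted_hidden_states` argument described in the docstring above: the caller pre-allocates the permute output, as `run_cutlass_moe_fp8` does with its fp8 view of `workspace2`. Here `a1q`, `a1q_scale`, `topk_ids`, `expert_map`, `global_num_experts`, `local_E`, `M`, `topk` and `K` are assumed to already exist with the meanings they have in that function.

```python
import torch

from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    moe_permute)

# Pre-allocated destination for the permuted activations; moe_permute asserts
# that its shape matches (permuted_row_size, n_hidden).
a1q_perm = torch.empty((M * topk, K), dtype=torch.float8_e4m3fn, device="cuda")

# Mirror the call site in run_cutlass_moe_fp8: with an expert_map, the number
# of experts seen by the kernel is the size of the map.
n_expert = global_num_experts if expert_map is None else expert_map.size(0)
a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute(
    a1q, a1q_scale, topk_ids, n_expert, local_E, expert_map,
    permuted_hidden_states=a1q_perm)
```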
@@ -122,11 +126,16 @@ def moe_permute( 1) // align_block_size * align_block_size if n_local_expert == -1: n_local_expert = n_expert - permuted_hidden_states = torch.empty( - (permuted_row_size, n_hidden), - dtype=hidden_states.dtype, - device=hidden_states.device, - ) + if permuted_hidden_states is None: + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), ( + f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}" + f" but got {permuted_hidden_states.size()}") + token_expert_indices = torch.arange(0, n_token * topk, dtype=torch.int32, @@ -153,7 +162,8 @@ def moe_permute( align_block_size, permuted_hidden_states, expert_first_token_offset, inv_permuted_idx, permuted_idx, m_indices) - if a1q_scale is not None: + + if a1q_scale is not None and a1q_scale.dim() > 1: a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk] return (permuted_hidden_states, a1q_scale, expert_first_token_offset, @@ -185,6 +195,7 @@ def moe_unpermute( n_hidden = permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" + torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, inv_permuted_idx, expert_first_token_offset, topk, out) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8ca8249e69..7bc35cd81a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -669,6 +669,25 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): from vllm.model_executor.layers.fused_moe import fused_experts self.fused_experts_func = fused_experts + if self.use_cutlass: + device = layer.w13_weight.device + # ab_strides1 and c_strides2 are the same + self.ab_strides1_c_strides2 = torch.full( + (layer.local_num_experts, ), + layer.hidden_size, + device=device, + dtype=torch.int64) + self.ab_strides2 = torch.full( + (layer.local_num_experts, ), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.c_strides1 = torch.full( + (layer.local_num_experts, ), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -693,6 +712,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) else: logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) @@ -700,6 +723,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) self.disable_expert_map = (num_dispatchers > 1 @@ -822,6 +849,10 @@ class 
CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) From 5efd6905bc8469a30664de83bdafaad56aa92903 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 23:42:28 +0800 Subject: [PATCH 432/932] [CLI][Doc] Formalize `--mm-encoder-tp-mode` (#23190) Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 45 ++++++++++++++++++++++++ vllm/config/__init__.py | 34 +++++++++++++++++- vllm/config/parallel.py | 4 --- vllm/engine/arg_utils.py | 35 +++++++++++------- vllm/model_executor/models/mllama4.py | 4 +-- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/step3_vl.py | 3 +- 7 files changed, 104 insertions(+), 24 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index c7f50497d6..db9dfb313f 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,6 +129,51 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. +### Batch-level DP for Multi-Modal Encoders + +By default, TP is used to shard the weights of multi-modal encoders just like for language decoders, +in order to reduce the memory and compute load on each GPU. + +However, since the size of multi-modal encoders is very small compared to language decoders, +there is relatively little gain from TP. On the other hand, TP incurs significant communication +overhead because of all-reduce being performed after every layer. + +Given this, it may be advantageous to instead shard the batched input data using TP, essentially +performing batch-level DP. This has been shown to improve the throughput by around 10% for +`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations, +batch-level DP can provide another 40% increase in throughput compared to regular TP. + +Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank, +there will be a minor increase in memory consumption, which may cause OOM if you can barely fit the model already. + +You can enable batch-level DP by setting `mm_encoder_tp_mode="data"`, for example: + +```python +from vllm import LLM + +llm = LLM( + model="Qwen/Qwen2.5-VL-72B-Instruct", + # Create two EngineCore instances, one per DP rank + data_parallel_size=2, + # Within each EngineCore instance: + # The vision encoder uses TP=4 (not DP=2) to shard the input data + # The language decoder uses TP=4 to shard the weights as usual + tensor_parallel_size=4, + mm_encoder_tp_mode="data", +) +``` + +!!! important + Batch-level DP is not to be confused with API request-level DP + (which is instead controlled by `data_parallel_size`). + +The availability of batch-level DP depends on the model implementation.
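For reference, the same mode can also be selected through the engine-argument layer this patch extends (the new `--mm-encoder-tp-mode` CLI flag maps onto this field); a minimal sketch using `EngineArgs`, with the model name chosen only as an example:

```python
from vllm.engine.arg_utils import EngineArgs

# Sketch: mm_encoder_tp_mode is the field added in this patch; it replaces the
# deprecated --enable-multimodal-encoder-data-parallel flag.
engine_args = EngineArgs(
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    tensor_parallel_size=4,
    mm_encoder_tp_mode="data",
)
```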
+Currently, the following models support `mm_encoder_tp_mode="data"`: + +- Llama4 () +- Qwen2.5-VL () +- Step3 () + ## Input Processing ### Parallel Processing diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 801fa97fe5..5b5d477ef0 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -258,6 +258,7 @@ TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"] +MMEncoderTPMode = Literal["weights", "data"] @config @@ -438,6 +439,19 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -856,8 +870,10 @@ class ModelConfig: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling) + skip_mm_profiling=self.skip_mm_profiling, + ) return None @@ -2547,6 +2563,22 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """ + Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP. + """ + interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800..7a9e68f0ea 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -137,10 +137,6 @@ class ParallelConfig: rank: int = 0 """Global rank in distributed setup.""" - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. 
- Only support LLama4 for now""" - @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 48d9cd08af..6869c3f23f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -28,12 +28,12 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, ModelConfig, ModelDType, - ModelImpl, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, MultiModalConfig, + ObservabilityConfig, ParallelConfig, PoolerConfig, + PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, + SchedulerPolicy, SpeculativeConfig, TaskOption, + TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -352,6 +352,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields enable_lora: bool = False @@ -434,16 +435,14 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - enable_multimodal_encoder_data_parallel: bool = \ - ParallelConfig.enable_multimodal_encoder_data_parallel + # DEPRECATED + enable_multimodal_encoder_data_parallel: bool = False logits_processors: Optional[list[Union[ str, type[LogitsProcessor]]]] = ModelConfig.logits_processors """Custom logitproc types""" async_scheduling: bool = SchedulerConfig.async_scheduling - # DEPRECATED - enable_prompt_adapter: bool = False kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill @@ -685,7 +684,8 @@ class EngineArgs: **parallel_kwargs["worker_extension_cls"]) parallel_group.add_argument( "--enable-multimodal-encoder-data-parallel", - **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) + action="store_true", + deprecated=True) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -735,6 +735,8 @@ class EngineArgs: multimodal_group.add_argument("--disable-mm-preprocessor-cache", action="store_true", deprecated=True) + multimodal_group.add_argument( + "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -909,6 +911,14 @@ class EngineArgs: self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + if self.enable_multimodal_encoder_data_parallel: + logger.warning( + "--enable-multimodal-encoder-data-parallel` is deprecated " + "and will be removed in v0.13. 
" + "Please use `--mm-encoder-tp-mode data` instead.") + + self.mm_encoder_tp_mode = "data" + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -947,6 +957,7 @@ class EngineArgs: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1258,8 +1269,6 @@ class EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, - enable_multimodal_encoder_data_parallel=self. - enable_multimodal_encoder_data_parallel, ) if model_config.is_multimodal_model: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 35103eac8f..595bdd17cf 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -728,8 +728,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 34eec10296..811ecffcc1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -877,8 +877,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 5d41a9e569..f8877b584b 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -882,8 +882,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. 
- enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config.get_limit_per_prompt("image"): self.vision_model = Step3VisionTransformer( From d6d13bd49ed7fda56ac6a1b0aa53621490c975ac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 09:05:29 -0700 Subject: [PATCH 433/932] [Misc] Add max_seq_len to CommonAttentionMetadata (#23216) Signed-off-by: Woosuk Kwon --- tests/v1/attention/utils.py | 2 ++ tests/v1/spec_decode/test_tree_attention.py | 2 ++ vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/utils.py | 6 ++++++ vllm/v1/attention/backends/xformers.py | 2 +- vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/worker/gpu_model_runner.py | 4 ++++ 12 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index a4e38eb32f..e547e71e0c 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -58,6 +58,7 @@ def create_common_attn_metadata( dtype=torch.int32, device=device) seq_lens_cpu = seq_lens.cpu() + max_seq_len = int(seq_lens_cpu.max()) # Create computed tokens (context length for each sequence) context_lens = [ @@ -101,6 +102,7 @@ def create_common_attn_metadata( num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, causal=True, diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 456ce712d3..6317817408 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -50,6 +50,7 @@ def forward_attention( dtype=torch.int32, ) context_lens = seq_lens - query_lens + max_seq_len = int(seq_lens.max()) max_query_len = q_len num_actual_tokens = query_start_loc[-1] @@ -81,6 +82,7 @@ def forward_attention( num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table, slot_mapping=slot_mapping, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ab7a71a399..eed3cba9a2 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -233,7 +233,7 @@ class FlashAttentionMetadataBuilder( num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 53fafbc4af..8a25088848 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -463,7 +463,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): page_size = self.page_size max_q_len = common_attn_metadata.max_query_len - max_seq_len = common_attn_metadata.seq_lens_cpu.max().item() + max_seq_len = 
common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e599411b2d..abca981035 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -305,7 +305,7 @@ class FlexAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 36b5853bfd..b9ff113573 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -270,7 +270,7 @@ class AiterFlashAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 5d10e9e260..2a0c52377c 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -205,7 +205,7 @@ class TreeAttentionMetadataBuilder( q_start_loc = common_attn_metadata.query_start_loc max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 48a9af3dec..c69dd8415f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -90,7 +90,7 @@ class TritonAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 94dd3d2629..57c4d436c5 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -58,6 +58,8 @@ class CommonAttentionMetadata: """Total number of tokens in batch""" max_query_len: int """Longest query in batch""" + max_seq_len: int + """Longest context length in batch""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -107,6 +109,7 @@ def _make_metadata_with_slice( seq_lens = attn_metadata.seq_lens[request_slice] seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + max_seq_len = int(seq_lens_cpu.max()) num_computed_tokens_cpu = 
attn_metadata.num_computed_tokens_cpu[ request_slice] @@ -128,6 +131,7 @@ def _make_metadata_with_slice( num_reqs=num_requests, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, ) @@ -520,6 +524,7 @@ def make_local_attention_virtual_batches( query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) seq_lens_cpu = torch.from_numpy(seqlens_k_local) + max_seq_len = int(seq_lens_cpu.max()) return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, @@ -531,6 +536,7 @@ def make_local_attention_virtual_batches( num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), + max_seq_len=max_seq_len, block_table_tensor=block_table_local, slot_mapping=common_attn_metadata.slot_mapping, causal=True, diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index fe732c6017..b305bc1539 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -231,7 +231,7 @@ class XFormersAttentionMetadataBuilder( q_seqlens = torch.diff(q_start_loc) max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8cd2ad12cf..cc2b2a139d 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -582,6 +582,7 @@ class EagleProposer: num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), + max_seq_len=new_seq_lens_cpu.max().item(), block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping[token_indices], causal=True, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0bab3367c..d9770226b1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -774,6 +774,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.seq_lens_np[num_reqs:].fill(0) self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True) seq_lens = self.seq_lens[:num_reqs] + max_seq_len = self.seq_lens_np[:num_reqs].max().item() # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( @@ -886,6 +887,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, causal=True, @@ -2338,6 +2340,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=self.max_model_len, block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. 
@@ -3343,6 +3346,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=self.seq_lens_cpu[:num_reqs].max().item(), block_table_tensor=dummy_block_table, slot_mapping=dummy_slot_mapping, causal=False, From 3b11b26b5069718a6bde11b9041681bc17369f96 Mon Sep 17 00:00:00 2001 From: JartX Date: Wed, 20 Aug 2025 18:08:29 +0200 Subject: [PATCH 434/932] [FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not compatible with AITER (#22795) Signed-off-by: JartX Signed-off-by: tjtanaa Co-authored-by: tjtanaa --- vllm/v1/spec_decode/eagle.py | 80 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index cc2b2a139d..0a0e9fed72 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast from dataclasses import replace -from typing import Optional +from importlib.util import find_spec +from typing import Optional, Protocol import numpy as np import torch @@ -20,8 +21,6 @@ from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.rocm_aiter_fa import ( - AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata @@ -34,6 +33,17 @@ logger = init_logger(__name__) PADDING_SLOT_ID = -1 +class EagleAttentionMetadata(Protocol): + # Required attributes + num_actual_tokens: int + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + class EagleProposer: def __init__( @@ -97,6 +107,20 @@ class EagleProposer: dtype=self.dtype, device=device) + # Determine allowed attention backends once during initialization. + self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] + if current_platform.is_rocm(): + rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] + # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend + if find_spec("vllm.v1.attention.backends.rocm_aiter_fa"): + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + rocm_types.append(AiterFlashAttentionMetadata) + self.allowed_attn_types = tuple(rocm_types) + else: + self.allowed_attn_types = (FlashAttentionMetadata, + TreeAttentionMetadata) + # Parse the speculative token tree. spec_token_tree = self.speculative_config.speculative_token_tree self.tree_choices: list[tuple[int, @@ -165,7 +189,7 @@ class EagleProposer: for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) else: num_input_tokens = num_tokens @@ -225,25 +249,13 @@ class EagleProposer: # TODO: Currently, MTP module released by deepseek only has # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. 
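The backend selection above guards the AITER import with `importlib.util.find_spec`, so the optional module is only imported when it can actually be resolved. A generic, self-contained sketch of that pattern (using `xformers` purely as an example of an optional dependency):

```python
from importlib.util import find_spec

# find_spec returns None when the top-level module cannot be located, so this
# never raises ImportError on platforms where the optional backend is absent.
if find_spec("xformers") is not None:
    from xformers import ops as xops
else:
    xops = None  # callers must check for None and take a fallback path
```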
- - # On ROCm, both AiterFlashAttention and TritonAttention - # support multi-token eagle spec decode. - if current_platform.is_rocm(): - assert isinstance( - attn_metadata, - (TritonAttentionMetadata, AiterFlashAttentionMetadata, - FlashAttentionMetadata)) - else: - # Currently, only FlashAttention supports multi-token eagle spec - # decode. This is because the code below makes assumptions about - # attn_metadata attributes available. - assert isinstance(attn_metadata, FlashAttentionMetadata) + assert isinstance(attn_metadata, self.allowed_attn_types) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] if self.use_cuda_graph and \ - batch_size <= self.cudagraph_batch_sizes[-1]: + batch_size <= self.cudagraph_batch_sizes[-1]: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size @@ -449,7 +461,7 @@ class EagleProposer: num_tokens, -1) if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_tokens) else: @@ -508,19 +520,19 @@ class EagleProposer: """ # E.g. # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1, q1 + q2, q1 + q2 + q3] + # [0, q1, q1 + q2, q1 + q2 + q3] # common_attn_metadata.seq_lens{_cpu}: [s1, s2, s3] # num_rejected_tokens: [n1, n2, n3] # This function computes the intermediate values: # num_tokens_per_req: [q1 - n1, q2 - n2, q3 - n3] # And returns: # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] + # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] # common_attn_metadata.seq_lens{_cpu}: - # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] + # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] # token_indices: [0, 1, ..., q1 - n1 - 1, - # q1, q1 + 1, ..., q1 + q2 - n2 - 1, - # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + # q1, q1 + 1, ..., q1 + q2 - n2 - 1, + # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu @@ -564,9 +576,9 @@ class EagleProposer: old_query_start_locs_expanded = np.repeat( query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np) # Final token indices are: - # [0, 1, // req 1 - # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 - # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 + # [0, 1, // req 1 + # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 + # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 token_indices_np = token_offests + old_query_start_locs_expanded token_indices = torch.from_numpy(token_indices_np).to( device, non_blocking=True) @@ -616,20 +628,18 @@ class EagleProposer: target_language_model = target_model # share embed_tokens with the target model if needed if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: + and self.model.model.embed_tokens.weight.shape \ + == target_language_model.model.embed_tokens.weight.shape: logger.info( - "Assuming the EAGLE head shares the same vocab embedding" \ - " with the target model." - ) + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") del self.model.model.embed_tokens self.model.model.embed_tokens = ( target_language_model.model.embed_tokens) else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" \ - " from the target model." 
- ) + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") # share lm_head with the target model if needed # some model definition do not define lm_head explicitly From dfd2382039c38be80d6c2c9b56e441b5bd7cd0ad Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:52:59 -0700 Subject: [PATCH 435/932] [torch.compile] Support conditional torch.compile per module (#22269) Signed-off-by: Yong Hoon Shin --- .buildkite/test-pipeline.yaml | 2 + .../compile/piecewise/test_multiple_graphs.py | 135 +++------- tests/compile/test_decorator.py | 251 ++++++++++++++++++ vllm/compilation/decorators.py | 21 +- 4 files changed, 307 insertions(+), 102 deletions(-) create mode 100644 tests/compile/test_decorator.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f7f1db75b..7454206640 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -328,6 +328,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental] @@ -341,6 +342,7 @@ steps: - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_full_cudagraph.py + - pytest -v -s compile/piecewise/test_multiple_graphs.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental] diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index e460d70951..f5e2d9ddb7 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import (ignore_torch_compile, support_torch_compile) -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) -from vllm.envs import VLLM_USE_V1 -from vllm.forward_context import set_forward_context +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel): return x -def test_ignore_torch_compile_decorator(): - assert VLLM_USE_V1 - - # piecewise - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - splitting_ops=["silly.attention"], - cudagraph_capture_sizes=[1, 2], - )) - - @support_torch_compile - class A(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = '', - **kwargs) -> None: - super().__init__() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x + x - attn_output = torch.empty_like(x) - torch.ops.silly.attention(x, x, x, attn_output) - x = attn_output - x = x * 3 - return x - - @ignore_torch_compile - class B(A): - ... - - @support_torch_compile - class C(B): - ... 
- - with set_current_vllm_config(vllm_config): - mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() - - # A has support_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - # first run is for compile - mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - # run cudagraph captured sizes - mod_A(torch.randn(2, MLP_SIZE).cuda()) - mod_A(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() - - # B's ignore_torch_compile should override A's support_torch_compile - with compilation_counter.expect( - num_graphs_seen=0, - num_piecewise_graphs_seen=0, - num_piecewise_capturable_graphs_seen=0, - num_backend_compilations=0, - num_cudagraph_captured=0, - ), set_forward_context({}, vllm_config=vllm_config): - mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_B(torch.randn(2, MLP_SIZE).cuda()) - mod_B(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() - - # C's support_torch_compile should override B's ignore_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_C(torch.randn(2, MLP_SIZE).cuda()) - mod_C(torch.randn(1, MLP_SIZE).cuda()) - - @torch.inference_mode -def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): +def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor, + cudagraph_runtime_mode: CUDAGraphMode): with set_forward_context({}, vllm_config=vllm_config): - # First run is for compile + # warmup for the model with cudagraph_mode NONE model(inputs) - # Run CUDAGraph captured sizes - model(inputs[:2]) - model(inputs[:1]) + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(inputs[:2]) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(inputs[:1]) - output = model(inputs[:2]) + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(inputs[:2]) output = output.cpu() return output.cpu() @@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): splitting_ops=["silly.attention"], cudagraph_capture_sizes=[1, 2], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_cudagraph_captured=8, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # no 
compile or cudagraph vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.NO_COMPILATION, )) + cudagraph_runtime_mode = CUDAGraphMode.NONE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # piecewise compile without CUDA graph vllm_config = VllmConfig(compilation_config=CompilationConfig( @@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=False, splitting_ops=["silly.attention"], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=4, num_cudagraph_captured=0, # no cudagraph captured ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # Generally don't expect outputs with and without inductor # to be bitwise equivalent diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py new file mode 100644 index 0000000000..51f8ddd566 --- /dev/null +++ b/tests/compile/test_decorator.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import (ignore_torch_compile, + support_torch_compile) +from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, + CUDAGraphMode, VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + +BATCH_SIZE = 32 +MLP_SIZE = 128 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@torch.inference_mode +def run_model(vllm_config: VllmConfig, model: nn.Module, + cudagraph_runtime_mode: CUDAGraphMode): + with set_forward_context({}, vllm_config=vllm_config): + # warmup for the model with cudagraph_mode NONE + model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(torch.randn(2, MLP_SIZE).cuda()) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(torch.randn(1, MLP_SIZE).cuda()) + + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + 
num_tokens=2, )): + output = model(torch.randn(2, MLP_SIZE).cuda()) + + output = output.cpu() + return output.cpu() + + +def test_ignore_torch_compile_decorator(): + # piecewise + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + @support_torch_compile + class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + @ignore_torch_compile + class B(A): + ... + + @support_torch_compile + class C(B): + ... + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() + + # B's ignore_torch_compile should override A's support_torch_compile + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ): + run_model(vllm_config, mod_B, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() + + # C's support_torch_compile should override B's ignore_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_C, cudagraph_runtime_mode) + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=True +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. + kv_sharing_fast_prefill) +class B(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x + x + return x + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=False +@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. 
+ cache_config.kv_sharing_fast_prefill) +class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.mod1(x) + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = self.mod2(x) + return x + + +def test_conditional_compile_enable_if(): + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=True, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile but enable_if fn returns False + # enable_if will be True for B, so we expect mod1 and mod2 + # to be compiled + with compilation_counter.expect( + num_graphs_seen=2, + num_piecewise_graphs_seen=6, + # 3 piecewise graphs per instance of B() + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + # Set kv_sharing_fast_prefill=False + # which will cause A to be compiled and B to not be compiled + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=False, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=7, + # 3 attn ops and 4 non-attn ops + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 58f70ef9ef..41d9fcb824 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -52,6 +52,14 @@ def _should_ignore_torch_compile(cls) -> bool: return getattr(cls, IGNORE_COMPILE_KEY, False) +@overload +def support_torch_compile( + *, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, +) -> Callable[[_T], _T]: + ... + + @overload def support_torch_compile( + *, @@ -69,6 +77,7 @@ def support_torch_compile( cls: Optional[_T] = None, *, dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -118,6 +127,11 @@ def support_torch_compile( NOTE: if an argument is `None`, it should always be passed as `None` during the lifetime of the model, otherwise, it cannot be captured as a single computation graph. + + `enable_if` is a function that takes a `VllmConfig` object as input and + returns a boolean value indicating whether to compile the model or not.
+ This is useful if you want to compile the model only when certain + conditions are met. """ def cls_decorator_helper(cls: _T) -> _T: @@ -149,7 +163,8 @@ def support_torch_compile( if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, inferred_dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims, + enable_if) if cls is not None: # use `support_torch_compile` as a decorator without arguments @@ -162,6 +177,7 @@ def support_torch_compile( def _support_torch_compile( cls: _T, dynamic_arg_dims: dict[str, Union[int, list[int]]], + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> _T: """ A decorator to add support for compiling the forward method of a class. @@ -182,13 +198,14 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config + enable_compile = enable_if is None or enable_if(vllm_config) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = \ vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() or _should_ignore_torch_compile( - self.__class__) + self.__class__) or not enable_compile if self.do_not_compile: return From c4477f55e581e5ef5f52bbe39cba6e0de1956444 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 20 Aug 2025 10:37:29 -0700 Subject: [PATCH 436/932] Migrate Mistral3ImagePixelInputs to TensorSchema (#21945) Signed-off-by: Benji Beck Co-authored-by: Cyrus Leung --- vllm/model_executor/models/mistral3.py | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index a647292d3a..438513433d 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -42,15 +43,23 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .vision import get_vision_encoder_info -class Mistral3ImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class Mistral3ImagePixelInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each image + - w: Width of each image """ - Shape: `(batch_size * num_images, num_channels, height, width)` - Note that `height` or `width` may be different per batch and image, - in which case the data is passed as a list instead of a batched tensor. 
- """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + + # Note that `height` or `width` may be different per batch and image, + # in which case the data is passed as a list instead of a batched tensor. + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}), + ] class Mistral3PatchMerger(nn.Module): @@ -456,19 +465,6 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Mistral3ImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) From f77a0802b758a32c5b9f7bc04e9498d77e8d99e0 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 13:57:37 -0400 Subject: [PATCH 437/932] Limit HTTP header count and size (#23267) Signed-off-by: Taneem Ibrahim Signed-off-by: Russell Bryant Co-authored-by: Taneem Ibrahim --- vllm/entrypoints/constants.py | 10 ++++++++++ vllm/entrypoints/launcher.py | 21 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 ++ vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 4 files changed, 41 insertions(+) create mode 100644 vllm/entrypoints/constants.py diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py new file mode 100644 index 0000000000..b5bcccc35d --- /dev/null +++ b/vllm/entrypoints/constants.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared constants for vLLM entrypoints. +""" + +# HTTP header limits for h11 parser +# These constants help mitigate header abuse attacks +H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB +H11_MAX_HEADER_COUNT_DEFAULT = 256 diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 9f4dc19fb4..4e852ba594 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -14,6 +14,8 @@ from vllm import envs from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.ssl import SSLCertRefresher from vllm.logger import init_logger from vllm.utils import find_process_using_port @@ -26,6 +28,11 @@ async def serve_http(app: FastAPI, sock: Optional[socket.socket], enable_ssl_refresh: bool = False, **uvicorn_kwargs: Any): + """ + Start a FastAPI app using Uvicorn, with support for custom Uvicorn config + options. Supports http header limits via h11_max_incomplete_event_size and + h11_max_header_count. 
+ """ logger.info("Available routes are:") for route in app.routes: methods = getattr(route, "methods", None) @@ -36,7 +43,21 @@ async def serve_http(app: FastAPI, logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + # Extract header limit options if present + h11_max_incomplete_event_size = uvicorn_kwargs.pop( + "h11_max_incomplete_event_size", None) + h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None) + + # Set safe defaults if not provided + if h11_max_incomplete_event_size is None: + h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + if h11_max_header_count is None: + h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT + config = uvicorn.Config(app, **uvicorn_kwargs) + # Set header limits + config.h11_max_incomplete_event_size = h11_max_incomplete_event_size + config.h11_max_header_count = h11_max_header_count config.load() server = uvicorn.Server(config) _add_shutdown_handlers(app, server) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 765327da3b..24148bcef2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1922,6 +1922,8 @@ async def run_server_worker(listen_address, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, **uvicorn_kwargs, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e15f65b430..6e4eff5c80 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -20,6 +20,8 @@ from vllm.config import config from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger @@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" enable_log_outputs: bool = False """If set to True, enable logging of model outputs (generations) in addition to the input logging that is enabled by default.""" + h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + """Maximum size (bytes) of an incomplete HTTP event (header or body) for + h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" + h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT + """Maximum number of HTTP headers allowed in a request for h11 parser. + Helps mitigate header abuse. 
Default: 256.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From ebe56a0064f7a72a5c51d4cd6bcca165590c5bed Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:15:18 -0400 Subject: [PATCH 438/932] Small fix for Command-A-Vision (#23268) Signed-off-by: donglu --- vllm/model_executor/models/cohere2_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 4682a8a428..fca1aee835 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -348,7 +348,7 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, vllm_config=vllm_config, hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), - architectures=["Cohere2ForCausalLM"]) + architectures=config.text_config.architectures) @property def dtype(self): From 0cdbf5e61ce3fd97d33b31b775d2faaadc99fbc5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 15:13:36 -0400 Subject: [PATCH 439/932] [Kernel/Quant] Remove the original marlin format and qqq (#23204) Signed-off-by: mgoin --- .../configs/Meta-Llama-3-8B-QQQ.yaml | 12 - .../lm-eval-harness/configs/models-large.txt | 1 - CMakeLists.txt | 2 - benchmarks/kernels/benchmark_machete.py | 23 +- csrc/quantization/machete/generate.py | 139 +- csrc/quantization/marlin/dense/LICENSE | 209 --- csrc/quantization/marlin/dense/common/base.h | 32 - csrc/quantization/marlin/dense/common/mem.h | 89 -- .../marlin/dense/marlin_cuda_kernel.cu | 1073 -------------- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 1248 ----------------- csrc/torch_bindings.cpp | 17 - tests/compile/test_full_graph.py | 6 - tests/kernels/quantization/test_machete_mm.py | 34 +- .../kernels/quantization/test_marlin_gemm.py | 83 -- tests/quantization/test_configs.py | 10 - tests/quantization/test_lm_head.py | 6 +- tests/weight_loading/models.txt | 4 - vllm/_custom_ops.py | 36 - vllm/config/__init__.py | 7 +- vllm/lora/layers.py | 3 - vllm/model_executor/layers/linear.py | 1 - .../layers/quantization/__init__.py | 6 - .../layers/quantization/marlin.py | 263 ---- .../model_executor/layers/quantization/qqq.py | 275 ---- .../utils/marlin_utils_test_qqq.py | 126 -- .../layers/quantization/utils/quant_utils.py | 85 -- 26 files changed, 92 insertions(+), 3698 deletions(-) delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml delete mode 100644 csrc/quantization/marlin/dense/LICENSE delete mode 100644 csrc/quantization/marlin/dense/common/base.h delete mode 100644 csrc/quantization/marlin/dense/common/mem.h delete mode 100644 csrc/quantization/marlin/dense/marlin_cuda_kernel.cu delete mode 100644 csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu delete mode 100644 vllm/model_executor/layers/quantization/marlin.py delete mode 100644 vllm/model_executor/layers/quantization/qqq.py delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml deleted file mode 100644 index 56ec933c9c..0000000000 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# For vllm script, with -t option (tensor parallel size). 
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 -model_name: "HandH1998/QQQ-Llama-3-8b-g128" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.419 - - name: "exact_match,flexible-extract" - value: 0.416 -limit: 1000 -num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt index 27a1a9a82b..37eeac85c9 100644 --- a/.buildkite/lm-eval-harness/configs/models-large.txt +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml Mixtral-8x7B-Instruct-v0.1.yaml Qwen2-57B-A14-Instruct.yaml DeepSeek-V2-Lite-Chat.yaml -Meta-Llama-3-8B-QQQ.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index bcbd1b52a0..a1deefb07f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 975d10f2e9..a9c4d30d9b 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 88b3f9c734..0d14ba1593 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -571,78 +571,79 @@ def generate(): itertools.repeat(default_heuristic)) ] - # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) - # TODO (LucasWilkinson): Further tuning required - qqq_tile_heuristic_config = { - #### M = 257+ - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), - # "M > 256": ((128, 256), (2, 1, 1)), - "M > 256": ((128, 128), (2, 1, 1)), - #### M = 129-256 - "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), - "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 128": ((128, 256), (2, 1, 1)), - "M > 128": ((128, 128), (2, 1, 1)), - #### M = 65-128 - "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), - "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), - "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), 
- "M > 64": ((128, 128), (2, 1, 1)), - #### M = 33-64 - "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), - # Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), - "M > 32": ((128, 64), (2, 1, 1)), - #### M = 17-32 - "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), - "M > 16": ((256, 32), (2, 1, 1)), - #### M = 1-16 - "N >= 26624": ((256, 16), (1, 1, 1)), - None: ((128, 16), (1, 1, 1)), - } + # TODO: Support W4A8 when ready + # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # # TODO (LucasWilkinson): Further tuning required + # qqq_tile_heuristic_config = { + # #### M = 257+ + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # # "M > 256": ((128, 256), (2, 1, 1)), + # "M > 256": ((128, 128), (2, 1, 1)), + # #### M = 129-256 + # "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + # "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 128": ((128, 256), (2, 1, 1)), + # "M > 128": ((128, 128), (2, 1, 1)), + # #### M = 65-128 + # "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + # "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + # "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + # "M > 64": ((128, 128), (2, 1, 1)), + # #### M = 33-64 + # "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # # Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + # "M > 32": ((128, 64), (2, 1, 1)), + # #### M = 17-32 + # "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + # "M > 16": ((256, 32), (2, 1, 1)), + # #### M = 1-16 + # "N >= 26624": ((256, 16), (1, 1, 1)), + # None: ((128, 16), (1, 1, 1)), + # } - # For now we use the same heuristic for all types - # Heuristic is currently tuned for H100s - qqq_heuristic = [ - (cond, ScheduleConfig(*tile_config, - **sch_common_params)) # type: ignore - for cond, tile_config in qqq_tile_heuristic_config.items() - ] + # # For now we use the same heuristic for all types + # # Heuristic is currently tuned for H100s + # qqq_heuristic = [ + # (cond, ScheduleConfig(*tile_config, + # **sch_common_params)) # type: ignore + # for cond, tile_config in qqq_tile_heuristic_config.items() + # ] - QQQ_kernel_types = [ - *(TypeConfig( - a=DataType.s8, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.s32, - ) for b_group_scale in (DataType.f16, DataType.void)), - *(TypeConfig( - a=DataType.e4m3, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.f32, - ) for b_group_scale in (DataType.f16, DataType.void)), - ] + # QQQ_kernel_types = [ + # *(TypeConfig( + # a=DataType.s8, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.s32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # *(TypeConfig( + # a=DataType.e4m3, + # 
b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.f32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # ] - impl_configs += [ - ImplConfig(x[0], x[1], x[2]) - for x in zip(QQQ_kernel_types, - itertools.repeat(get_unique_schedules(qqq_heuristic)), - itertools.repeat(qqq_heuristic)) - ] + # impl_configs += [ + # ImplConfig(x[0], x[1], x[2]) + # for x in zip(QQQ_kernel_types, + # itertools.repeat(get_unique_schedules(qqq_heuristic)), + # itertools.repeat(qqq_heuristic)) + # ] output_dir = os.path.join(SCRIPT_DIR, "generated") diff --git a/csrc/quantization/marlin/dense/LICENSE b/csrc/quantization/marlin/dense/LICENSE deleted file mode 100644 index 1d1e4cf9c8..0000000000 --- a/csrc/quantization/marlin/dense/LICENSE +++ /dev/null @@ -1,209 +0,0 @@ -Contains code from https://github.com/IST-DASLab/marlin - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. diff --git a/csrc/quantization/marlin/dense/common/base.h b/csrc/quantization/marlin/dense/common/base.h deleted file mode 100644 index 68c83d5478..0000000000 --- a/csrc/quantization/marlin/dense/common/base.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } - -// Instances of `Vec` are used to organize groups of >>registers<<, as needed -// for instance as inputs to tensor core operations. Consequently, all -// corresponding index accesses must be compile-time constants, which is why we -// extensively use `#pragma unroll` throughout the kernel code to guarantee -// this. -template -struct Vec { - T elems[n]; - __device__ T& operator[](int i) { return elems[i]; } -}; diff --git a/csrc/quantization/marlin/dense/common/mem.h b/csrc/quantization/marlin/dense/common/mem.h deleted file mode 100644 index 64f9c393d7..0000000000 --- a/csrc/quantization/marlin/dense/common/mem.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Predicated asynchronous global->shared copy; used for inputs A where we apply -// predication to handle batchsizes that are not multiples of 16. -__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, - bool pred = true) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); -} - -// Asynchronous global->shared copy -__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// Async copy fence. -__device__ inline void cp_async_fence() { - asm volatile("cp.async.commit_group;\n" ::); -} - -// Wait until at most `n` async copy stages are still pending. -template -__device__ inline void cp_async_wait() { - asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); -} - -// Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int* lock, int count) { - if (threadIdx.x == 0) { - int state = -1; - do - // Guarantee that subsequent writes by this threadblock will be visible - // globally. - asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" - : "=r"(state) - : "l"(lock)); - while (state != count); - } - __syncthreads(); -} - -// Release barrier and increment visitation count. -__device__ inline void barrier_release(int* lock, bool reset = false) { - __syncthreads(); - if (threadIdx.x == 0) { - if (reset) { - lock[0] = 0; - return; - } - int val = 1; - // Make sure that all writes since acquiring this barrier are visible - // globally, while releasing the barrier. - asm volatile("fence.acq_rel.gpu;\n"); - asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" - : - : "l"(lock), "r"(val)); - } -} diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu deleted file mode 100644 index ea96326ed7..0000000000 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace marlin_dense { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS = Vec; // quantization scales - -// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 -// output/accumulation. -__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) - : "r"(smem)); -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. -template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 -// values. We mostly follow the strategy in the link below, with some small -// changes: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -__device__ inline FragB dequant(int q) { - const int LO = 0x000f000f; - const int HI = 0x00f000f0; - const int EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - const int SUB = 0x64086408; - const int MUL = 0x2c002c00; - const int ADD = 0xd480d480; - FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - return frag_b; -} - -// Multiply dequantized values by the corresponding quantization scale; used -// only for grouped quantization. 
-__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); - frag_b[0] = __hmul2(frag_b[0], s); - frag_b[1] = __hmul2(frag_b[1], s); -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 8; - C += 16 * thread_m_blocks * prob_n / 8; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 8; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 2 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - int s_gl_stride = prob_n / 8; - constexpr int s_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_sh_stage = s_sh_stride; - int s_gl_rd_delta = s_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. 
- int a_sh_rd = - a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; - a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_sh_stride * slice_col + threadIdx.x; - auto s_sh_wr = threadIdx.x; - int s_sh_rd; - // We use a different scale layout for grouped and column-wise quantization as - // we scale a `half2` tile in column-major layout in the former and in - // row-major in the latter case. - if (group_blocks != -1) - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - else - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) % 4; - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - bool s_sh_wr_pred = threadIdx.x < s_sh_stride; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. - int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s = sh_b + (stages * b_sh_stage); - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; - - // Zero accumulators. 
- auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); - s_gl_rd += s_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. - auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; - FragB frag_b0 = dequant(b_quant); - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. 
- if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); - FragB frag_b1 = dequant(b_quant_shift); - if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. - - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 8; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 4 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 4 * (threadIdx.x / 32) + threadIdx.x % 4; - c_gl_wr += (2 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads; - auto c_sh_wr = threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. 
- #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half*>(&c_red)[j]); - } - } - if (!last) { - int4 c; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half*>(&c)[j] = - __float2half(reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - c; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. - auto write_result = [&]() { - int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks + 1; - int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int c_sh_rd_delta = - c_sh_stride * (threads / (2 * thread_n_blocks)); - - int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - c_gl_wr += (2 * thread_n_blocks) * slice_col; - int c_sh_wr = - (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - c_sh_wr += 32 * (threadIdx.x / 32); - int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int c_gl_wr_end = c_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { - half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - if (group_blocks == - -1) // for per-column quantization we finally apply the scale here - res = __hmul2(res, s[0]); - ((half2*)sh)[idx] = res; - }; - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = c_sh_wr + 8 * j; - write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); - } - c_sh_wr += 16 * (4 * c_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; - c_gl_wr += c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. 
- auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. - if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (group_blocks == -1 && last) { - if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); - cp_async_fence(); - } - thread_block_reduce(); - if (group_blocks == -1 && last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory -const int SHARED_MEM = - 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - SHARED_MEM); \ - Marlin<<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_cuda(const void* A, const void* B, void* C, void* 
s, int prob_m, - int prob_n, int prob_k, void* workspace, int groupsize = -1, - int dev = 0, cudaStream_t stream = 0, int thread_k = -1, - int thread_n = -1, int sms = -1, int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - // Uncomment for debug - // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + - // ", thread_n = " + str(th_config.thread_n) + - // ", num_threads = " + str(th_config.num_threads) + " for - // MKN = [" + str(prob_m) + - // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - const int4* s_ptr = (const int4*)s; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; - C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - } -} - -} // namespace marlin_dense - -torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, torch::Tensor& workspace, - int64_t size_m, int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % marlin_dense::tile_size == 0, - "size_k = " + str(size_k) + " is not divisible by tile_size = " + - str(marlin_dense::tile_size)); - TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + - str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + - ", tile_size = " + str(marlin_dense::tile_size)); - - // Verify N - TORCH_CHECK(b_scales.size(1) == size_n, - "b_scales.size(1) = " + str(b_scales.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - b_q_weight.size(1) % marlin_dense::tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(marlin_dense::tile_size)); - - int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) * - marlin_dense::pack_factor_4bit; - TORCH_CHECK( - size_n == actual_size_n, - "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify scales device and strides - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - torch::Tensor c = torch::empty({size_m, size_n}, options); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - // Detect groupsize - if (b_scales.size(0) != 1) { - TORCH_CHECK(size_k % b_scales.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by b_scales.size(0) = " + - str(b_scales.size(0))); - } - int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); - - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify workspace size - TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + - str(marlin_dense::min_thread_n)); - int min_workspace_size = - (size_n / marlin_dense::min_thread_n) * marlin_dense::max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - int dev = a.get_device(); - marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, marlin_dense::max_par); - - return c; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_gemm", &marlin_gemm); -} diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu deleted file mode 100644 index c96d68d9b2..0000000000 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * Adapted from - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda_kernel.cu - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda.cpp - * Modified by HandH1998 - * Copyright (C) 2024 HandH1998 - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <iostream>
-
-#include "../dense/common/base.h"
-#include "core/registration.h"
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-  #include "../dense/common/mem.h"
-#endif
-
-template <typename T>
-inline std::string str(T x) {
-  return std::to_string(x);
-}
-
-namespace {
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-
-using I4 = Vec<int, 4>;
-// Matrix fragments for tensor core instructions; their precise layout is
-// documented here:
-// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-integer-type
-using FragA = Vec<uint32_t, 2>;
-using FragB = Vec<uint32_t, 1>;
-using FragC = Vec<int, 4>;
-using FragS_GROUP = Vec<half2, 1>;  // weight per-group quantization scales
-using FragS_CHANNEL =
-    Vec<float, 2>;  // weight per-channel quantization scales or activation
-                    // per-token quantization scales
-
-// NOTE(HandH1998): cp.async.cg only supports BYTES = 16, however,
-// cp.async.ca can support BYTES = 4, 8, 16;
-// as s_tok's shape is equal to prob_m, we need to set s_tok to float type,
-// and cp_size = 1 float, i.e., 4 BYTES
-// Asynchronous global->shared copy for activation quantization scales s_tok
-__device__ inline void cp_async1(void* smem_ptr, const void* glob_ptr) {
-  const int BYTES = 4;
-  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile(
-      "{\n"
-      "   cp.async.ca.shared.global [%0], [%1], %2;\n"
-      "}\n" ::"r"(smem),
-      "l"(glob_ptr), "n"(BYTES));
-}
-
-// m16n8k16 tensor core mma instruction with int8 inputs and int32
-// output/accumulation.
-__device__ inline void mma(const FragA& a_frag, const FragB& frag_b,
-                           FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  int* c = reinterpret_cast<int*>(&frag_c);
-  asm volatile(
-      "mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 "
-      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-      : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-      : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-        "r"(c[3]));
-}
-
-// Instruction for loading a full 16x16 matrix fragment of operand A from shared
-// memory, directly in int8 tensor core layout.
-__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
-  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
-  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
-  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
-               : "=r"(a[0]), "=r"(a[1])
-               : "r"(smem));
-}
-
-inline __device__ half2 float2_to_half2(float2 f) {
-  uint32_t res;
-  // NOTE(HandH1998): h0,h1 should be uint16_t, not half
-  uint16_t h0, h1;
-  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h0) : "f"(f.x));
-  asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h1) : "f"(f.y));
-  asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(res) : "h"(h0), "h"(h1));
-  return reinterpret_cast<half2&>(res);
-}
-
-inline __device__ float int32_to_float(int h) {
-  float res;
-  asm volatile("cvt.rn.f32.s32 %0, %1;\n" : "=f"(res) : "r"(h));
-  return res;
-}
-
-// Lookup-table based 3-input logical operation; explicitly used for
-// dequantization as the compiler does not seem to automatically recognize it in
-// all cases.
-template <int lut>
-__device__ inline int lop3(int a, int b, int c) {
-  int res;
-  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-               : "=r"(res)
-               : "r"(a), "r"(b), "r"(c), "n"(lut));
-  return res;
-}
-
-// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
-// for weight per channel dequant.
-__device__ inline FragB dequant_per_channel(int q) {
-  static constexpr int MASK = 0xf0f0f0f0;
-  FragB frag_b;
-  frag_b[0] = (q & MASK);
-  return frag_b;
-}
-
-// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values
-// for weight per group dequant.
-__device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
-  static constexpr uint32_t LO = 0x000f000f;
-  static constexpr uint32_t HI = 0x00f000f0;
-  static constexpr uint32_t EX = 0x64006400;
-  // Guarantee that the `(a & b) | c` operations are LOP3s.
-  uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-  uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
-  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
-  // directly into `SUB` and `ADD`.
-  static constexpr uint32_t SUB = 0x64086408;
-  static constexpr uint32_t MUL = 0x2c002c00;
-  static constexpr uint32_t ADD = 0xd480d480;
-  *reinterpret_cast<half2*>(&t0) = __hsub2(
-      *reinterpret_cast<half2*>(&t0), *reinterpret_cast<const half2*>(&SUB));
-  *reinterpret_cast<half2*>(&t1) = __hfma2(
-      *reinterpret_cast<half2*>(&t1), *reinterpret_cast<const half2*>(&MUL),
-      *reinterpret_cast<const half2*>(&ADD));
-
-  uint16_t s = reinterpret_cast<uint16_t*>(&frag_s)[i];
-  uint32_t double_s;
-  // pack 2xfp16 to half2
-  asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(double_s) : "h"(s), "h"(s));
-  // dequant and convert 4 half to 4 uint8 (be placed at the low 8 bits of 4
-  // half, respectively)
-  static constexpr uint32_t MAGIC_NUM = 0x64806480;
-  *reinterpret_cast<half2*>(&t0) = __hfma2(
-      *reinterpret_cast<half2*>(&t0), *reinterpret_cast<half2*>(&double_s),
-      *reinterpret_cast<const half2*>(&MAGIC_NUM));
-  *reinterpret_cast<half2*>(&t1) = __hfma2(
-      *reinterpret_cast<half2*>(&t1), *reinterpret_cast<half2*>(&double_s),
-      *reinterpret_cast<const half2*>(&MAGIC_NUM));
-  // take out the 4 uint8 from 4 half, then convert them to 4 int8 and pack 4
-  // int8 into 1 uint32
-  FragB frag_b;
-  uint32_t uint8s;
-  static constexpr uint32_t MASK_0246 = 0x6420;
-  static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080;
-  asm volatile("prmt.b32 %0,%1,%2,%3;\n"
-               : "=r"(uint8s)
-               : "r"(t0), "r"(t1), "n"(MASK_0246));
-  frag_b[0] = (uint8s ^ UINT8s_TO_INT8s_MASK);
-  return frag_b;
-}
-
-template <const int threads,          // number of threads in a threadblock
-          const int thread_m_blocks,  // number of 16x16 blocks in the m
-                                      // dimension (batchsize) of the
-                                      // threadblock
-          const int thread_n_blocks,  // same for n dimension (output)
-          const int thread_k_blocks,  // same for k dimension (reduction)
-          const int stages,  // number of stages for the async global->shared
-                             // fetch pipeline
-          const int group_blocks = -1  // number of consecutive 16x16 blocks
-                                       // with a separate quantization scale
-          >
-__global__ void Marlin(
-    const int4* __restrict__ A,  // int8 input matrix of shape mxk
-    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
-    int4* __restrict__ C,  // int32 global_reduce buffer of shape
-                           // (max_par*16*4)xn, as int8 tensor core's output is
-                           // int32 dtype
-    int4* __restrict__ D,  // fp16 output buffer of shape mxn
-    const float* __restrict__ s_tok,  // fp32 activation per-token quantization
-                                      // scales of shape mx1
-    const int4* __restrict__ s_ch,    // fp32 weight per-channel quantization
-                                      // scales of shape 1xn
-    const int4* __restrict__ s_group,  // fp16 weight per-group quantization
-                                       // scales of shape (k/groupsize)xn, when
-                                       // group_blocks=-1, it should be nullptr
-    int prob_m,  // batch dimension m
-    int prob_n,  // output dimension n
-    int prob_k,  // reduction dimension k
-    int* locks   // extra global storage for barrier synchronization
-) {
-  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
-  // same size, which might involve multiple column "slices" (of width 16 *
-  // `thread_n_blocks`).
Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if constexpr (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 16; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 4; - D += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - s_tok += (slice_col_par / n_tiles) * 16 * thread_m_blocks; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 16; - C += 16 * thread_m_blocks * prob_n / 4; - D += 16 * thread_m_blocks * prob_n / 8; - s_tok += 16 * thread_m_blocks; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 16; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 16; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 16; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 1 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - constexpr int s_tok_sh_stride = 16 * thread_m_blocks; - - constexpr int s_ch_sh_stride = 16 * thread_n_blocks / 4; - - int s_group_gl_stride = prob_n / 8; - constexpr int s_group_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_group_sh_stage = s_group_sh_stride; - int s_group_gl_rd_delta = s_group_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. 
- // NOTE(HandH1998): int8 input a only need 16 threads to load 16x16 matrix - int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % 16); - a_sh_rd += 1 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - auto s_tok_gl_rd = threadIdx.x; - // NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10, - // 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for - // thread 0, 1, 2, 3. For more details, refer to mma operand A layout as - // s_tok's size is not fixed, we can not shuffle before inference we shuffle - // it when fetching s_tok from global memory to shared memory, that's why - // s_tok_sh_wr is like this - int s_tok_sh_wr = - (threadIdx.x / 16) * 16 + (threadIdx.x % 8) * 2 + (threadIdx.x % 16) / 8; - int s_tok_sh_rd = (threadIdx.x % 32) / 4; - bool s_tok_sh_wr_pred = threadIdx.x < prob_m; - - auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - auto s_ch_sh_wr = threadIdx.x; - int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - 2 * ((threadIdx.x % 32) % 4); - bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride; - - int s_group_gl_rd, s_group_sh_wr, s_group_sh_rd; - bool s_group_sh_wr_pred; - if constexpr (group_blocks != -1) { - s_group_gl_rd = - s_group_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_group_sh_stride * slice_col + threadIdx.x; - s_group_sh_wr = threadIdx.x; - // NOTE(HandH1998): s_group_sh_rd is related to mma output C - s_group_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - s_group_sh_wr_pred = threadIdx.x < s_group_sh_stride; - } - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. 
- int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - // NOTE(HandH1998): stages need >= 4, otherwise, sh_s_tok = sh + max(stages * - // a_sh_stage + stages * b_sh_stage, 4 * stages * a_sh_stage) - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s_tok = sh_b + (stages * b_sh_stage); - int4* sh_s_ch = sh_s_tok + s_tok_sh_stride; - int4* sh_s_group = sh_s_ch + s_ch_sh_stride; - - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS_GROUP frag_s_group[2][4]; - FragS_CHANNEL frag_s_tok[thread_m_blocks]; - FragS_CHANNEL frag_s_ch[2][4]; - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_group_stage = sh_s_group + s_group_sh_stage * pipe; - if (s_group_sh_wr_pred) - cp_async4(&sh_s_group_stage[s_group_sh_wr], - &s_group[s_group_gl_rd]); - s_group_gl_rd += s_group_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
- auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - int4* sh_s_group_stage = - sh_s_group + - s_group_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s_group[k % 2])[0] = - sh_s_group_stage[s_group_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - // int b_quant_shift = b_quant << 4; - FragB frag_b0, frag_b1; - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. - if constexpr (group_blocks != -1) { - int b_quant_shift = b_quant >> 8; - frag_b0 = dequant_per_group(b_quant, frag_s_group[k % 2][j], 0); - frag_b1 = dequant_per_group(b_quant_shift, frag_s_group[k % 2][j], 1); - } else { - int b_quant_shift = b_quant << 4; - frag_b0 = dequant_per_channel(b_quant); - frag_b1 = dequant_per_channel(b_quant_shift); - } - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. 
- - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - int* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - // global_reduce works on INT32 elements, which are the results of INT8 GEMM. - // This is why we need another INT32 maxtrix `C` to reduce instead of the - // original half matrix `D`. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). - constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 4; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 8 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2; - c_gl_wr += (4 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads * 2; - auto c_sh_wr = 2 * threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. 
- #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i + 1], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2) + 1], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 d_red1 = sh[c_sh_wr + i * c_sh_wr_delta]; - int4 d_red2 = sh[c_sh_wr + i * c_sh_wr_delta + 1]; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - reinterpret_cast(&d_red1)[j]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)] += - reinterpret_cast(&d_red2)[j]; - } - } - if (!last) { - int4 d1, d2; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d1)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d2)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)]; - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - d1; - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2) + - 1] = d2; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
- auto write_result = [&]() { - int d_gl_stride = prob_n / 8; - constexpr int d_sh_stride = 2 * thread_n_blocks + 1; - int d_gl_wr_delta = d_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int d_sh_rd_delta = - d_sh_stride * (threads / (2 * thread_n_blocks)); - - int d_gl_wr = d_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - d_gl_wr += (2 * thread_n_blocks) * slice_col; - int d_sh_wr = - (4 * d_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - d_sh_wr += 32 * (threadIdx.x / 32); - int d_sh_rd = d_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int d_gl_wr_end = d_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, int c0, int c1, float a_s, FragS_CHANNEL& w_s) { - float2 deq_res; - deq_res.x = int32_to_float(c0) * w_s[0] * a_s; - deq_res.y = int32_to_float(c1) * w_s[1] * a_s; - ((half2*)sh)[idx] = float2_to_half2(deq_res); - }; - - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = d_sh_wr + 8 * j; - write(wr + (4 * d_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * d_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - } - d_sh_wr += 16 * (4 * d_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (d_gl_wr < d_gl_wr_end) { - D[d_gl_wr] = sh[d_sh_rd]; - d_gl_wr += d_gl_wr_delta; - d_sh_rd += d_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (last) { - if (s_tok_sh_wr_pred) { - cp_async1(&sh_s_tok[s_tok_sh_wr], &s_tok[s_tok_gl_rd]); - } - if (s_ch_sh_wr_pred) { - cp_async4(&sh_s_ch[s_ch_sh_wr], &s_ch[s_ch_gl_rd]); - } - cp_async_fence(); - } - thread_block_reduce(); - if (last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - frag_s_tok[i][0] = - *reinterpret_cast(&sh_s_tok[16 * i + 2 * s_tok_sh_rd]); - frag_s_tok[i][1] = *reinterpret_cast( - &sh_s_tok[16 * i + 2 * s_tok_sh_rd + 1]); - } - reinterpret_cast(&frag_s_ch)[0] = sh_s_ch[s_ch_sh_rd + 0]; - reinterpret_cast(&frag_s_ch)[1] = sh_s_ch[s_ch_sh_rd + 1]; - reinterpret_cast(&frag_s_ch)[2] = sh_s_ch[s_ch_sh_rd + 8]; - reinterpret_cast(&frag_s_ch)[3] = sh_s_ch[s_ch_sh_rd + 9]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_group_gl_rd = s_group_sh_stride * slice_col + threadIdx.x; - s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, D_ptr, s_tok_ptr, s_ch_ptr, s_group_ptr, \ - prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_qqq_cuda(const void* A, const void* B, void* C, void* D, - void* s_tok, void* s_ch, void* 
s_group, int prob_m, - int prob_n, int prob_k, void* workspace, - int groupsize = -1, int dev = 0, cudaStream_t stream = 0, - int thread_k = -1, int thread_n = -1, int sms = -1, - int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - int max_shared_mem = 0; - cudaDeviceGetAttribute(&max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - TORCH_CHECK(max_shared_mem > 0); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - int4* D_ptr = (int4*)D; - const float* s_tok_ptr = (const float*)s_tok; - const int4* s_ch_ptr = (const int4*)s_ch; - const int4* s_group_ptr = (const int4*)s_group; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 16) * par; - D_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - s_tok_ptr += 16 * thread_m_blocks * par; - } -} -} // anonymous namespace - -torch::Tensor marlin_qqq_gemm(torch::Tensor const& a, - torch::Tensor const& b_q_weight, - torch::Tensor const& s_tok, - torch::Tensor const& s_ch, - torch::Tensor const& s_group, - torch::Tensor& workspace, int64_t size_m, - int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - TORCH_CHECK(size_m == s_tok.numel(), - "Shape mismatch: s_tok.numel() = " + str(s_tok.numel()) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % tile_size == 0, - "size_k = " + str(size_k) + - " is not divisible by tile_size = " + str(tile_size)); - TORCH_CHECK( - (size_k / tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + str(b_q_weight.size(0)) + - ", size_k = " + str(size_k) + ", tile_size = " + str(tile_size)); - - int groupsize = (s_group.numel() == 0) ? -1 : size_k / s_group.size(0); - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify N - TORCH_CHECK(s_ch.numel() == size_n, - "Shape mismatch: s_ch.numel() = " + str(s_ch.numel()) + - ", size_n = " + str(size_n)); - TORCH_CHECK(b_q_weight.size(1) % tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(tile_size)); - if (groupsize != -1) { - TORCH_CHECK(s_group.size(1) == size_n, - "Shape mismatch: s_group.size(1) = " + str(s_group.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - size_k % s_group.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by s_group.size(0) = " + str(s_group.size(0))); - } - - int actual_size_n = (b_q_weight.size(1) / tile_size) * pack_factor_4bit; - TORCH_CHECK(size_n == actual_size_n, - "Shape mismatch: size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify s_tok device, strides and dtype - TORCH_CHECK(s_tok.device().is_cuda(), "s_tok is not on GPU"); - TORCH_CHECK(s_tok.is_contiguous(), "s_tok is not contiguous"); - TORCH_CHECK(s_tok.dtype() == torch::kFloat32, "s_tok's dtype is not float32"); - - // Verify s_ch device, strides and dtype - TORCH_CHECK(s_ch.device().is_cuda(), "s_ch is not on GPU"); - TORCH_CHECK(s_ch.is_contiguous(), "s_ch is not contiguous"); - TORCH_CHECK(s_ch.dtype() == torch::kFloat32, "s_ch's dtype is not float32"); - - // Verify s_group device, strides and dtype - TORCH_CHECK(s_group.device().is_cuda(), 
"s_group is not on GPU"); - TORCH_CHECK(s_group.is_contiguous(), "s_group is not contiguous"); - TORCH_CHECK(s_group.dtype() == torch::kFloat16, - "s_group's dtype is not float16"); - - // Verify workspace size - TORCH_CHECK(size_n % min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + str(min_thread_n)); - int min_workspace_size = (size_n / min_thread_n) * max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options_c = torch::TensorOptions().dtype(torch::kInt).device(a.device()); - torch::Tensor c = torch::empty({max_par * 64, size_n}, options_c); - - // Alloc D matrix - auto options_d = - torch::TensorOptions().dtype(torch::kFloat16).device(a.device()); - torch::Tensor d = torch::empty({size_m, size_n}, options_d); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - int dev = a.get_device(); - marlin_qqq_cuda( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), d.data_ptr(), - s_tok.data_ptr(), s_ch.data_ptr(), s_group.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par); - - return d; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_qqq_gemm", &marlin_qqq_gemm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3a0ff6eaa7..60710f62c0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -241,14 +241,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // custom types: // https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA - // Marlin (Dense) Optimized Quantized GEMM for GPTQ. - ops.def( - "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> " - "Tensor", - {stride_tag}); - // conditionally compiled so impl in source file - // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " @@ -353,15 +345,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); #ifndef USE_ROCM - // marlin_qqq_gemm for QQQ. - ops.def( - "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " - "Tensor s_tok, Tensor s_ch, Tensor s_group, " - "Tensor! workspace, SymInt size_m, SymInt size_n, " - "SymInt size_k) -> Tensor", - {stride_tag}); - // conditionally compiled so impl registration is in source file - // CUTLASS nvfp4 block scaled GEMM ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index a2fc6ffeb8..84178344a5 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): "quantization": "gptq_marlin_24" })) - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a842d2f1cb..0e09661c95 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -95,23 +95,23 @@ TEST_TYPES = [ token_scale_type=None) for w_type in [scalar_types.uint4, scalar_types.uint8] for a_type in [torch.float16, torch.bfloat16]), - # QQQ style - *(TypeConfig(act_type=torch.int8, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), - *(TypeConfig(act_type=torch.float8_e4m3fn, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), + # # QQQ style + # *(TypeConfig(act_type=torch.int8, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), + # *(TypeConfig(act_type=torch.float8_e4m3fn, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), ] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index cea7700ac3..ad077e0b94 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -13,11 +13,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, query_marlin_supported_quant_types) @@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_weights) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( marlin_24_quantize) -from 
vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501 - marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) from vllm.scalar_type import scalar_types @@ -449,68 +443,6 @@ def test_hqq_marlin_gemm( assert max_diff < 0.04 -@pytest.mark.skipif(not is_quant_method_supported("qqq"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) -@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS) -@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES) -@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_marlin_qqq_gemm( - k_chunk, - n_chunk, - num_bits, - group_size, - mnk_factors, -): - int8_traits = torch.iinfo(torch.int8) - m_factor, n_factor, k_factor = mnk_factors - - size_m = m_factor - size_k = k_chunk * k_factor - size_n = n_chunk * n_factor - - a_input = rand_data((size_m, size_k)) - b_weight = rand_data((size_k, size_n)) - - # Quantize activations - s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to( - torch.float) - q_a = (a_input / s_a).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) - - # Quantize weights - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \ - marlin_qqq_quantize(b_weight, num_bits, group_size) - - workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_MAX_PARALLEL) - - opcheck(torch.ops._C.marlin_qqq_gemm, - (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel, - marlin_qqq_s_group, workspace.scratch, a_input.shape[0], - b_weight.shape[1], a_input.shape[1])) - - output = ops.marlin_qqq_gemm( - q_a, - marlin_qqq_q_w, - s_a, - marlin_qqq_s_channel, - marlin_qqq_s_group, - workspace.scratch, - a_input.shape[0], - b_weight.shape[1], - a_input.shape[1], - ) - output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref) - - torch.cuda.synchronize() - - max_diff = compute_max_diff(output, output_ref) - - assert max_diff < 0.04 - - def test_marlin_gemm_subset_input(): quant_type = scalar_types.uint4b8 group_size = 128 @@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m): max_diff = compute_max_diff(output, output_ref) assert max_diff < 0.04 - - -def test_marlin_gemm_opcheck(): - size_m = 2048 - size_n = 4096 - size_k = 4096 - a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16) - w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32) - s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16) - wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL).scratch - x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - torch.testing.assert_close(x, y) - opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k)) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 8cf8402436..1843bffd21 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -22,22 +22,12 @@ class ModelPair: MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool - # Model Serialized in Marlin Format should always use Marlin kernel. 
- ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"), # Model Serialized in Exllama Format. ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str - # Model Serialized in Marlin Format should always use Marlin kernel. - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"), # Model Serialized in Exllama Format. ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 11f78a23bb..5ec8b27c15 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -11,7 +11,6 @@ import torch from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod from vllm.model_executor.layers.vocab_parallel_embedding import ( UnquantizedEmbeddingMethod) @@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) ] @@ -41,8 +38,7 @@ def test_lm_head( lm_head_layer = model.lm_head if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, - MarlinLinearMethod)) + (GPTQLinearMethod, GPTQMarlinLinearMethod)) else: assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b79707409..cc18c9ff1f 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main -marlin, nm-testing/zephyr-beta-7b-marlin-g128, main -marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main -qqq, HandH1998/QQQ-Llama-3-8b-g128, main -qqq, HandH1998/QQQ-Llama-3-8b, main hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 39da08847b..59f2d7737f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -387,14 +387,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) -# marlin -def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, - size_n: int, size_k: 
int) -> torch.Tensor: - return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, - size_n, size_k) - - # marlin_24 def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, @@ -437,25 +429,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - @register_fake("_C::marlin_qqq_gemm") - def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - - @register_fake("_C::marlin_gemm") - def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - @register_fake("_C::awq_dequantize") def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: torch.SymInt, @@ -1348,15 +1321,6 @@ def scaled_int8_quant( return output, input_scales, input_azp -# qqq ops -def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - # gguf def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int, dtype: Optional[torch.dtype]) -> torch.Tensor: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 5b5d477ef0..62dfd4333b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1112,9 +1112,9 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS optimized_quantization_methods = [ - "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", - "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", - "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" + "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", + "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark", + "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, @@ -1137,7 +1137,6 @@ class ModelConfig: # `override_quantization_method` method) must be checked in order # of preference (this is particularly important for GPTQ). 
overrides = [ - "marlin", "bitblas", "gptq_marlin_24", "gptq_marlin", diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index de5933d6d4..24a05d310d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -48,9 +48,6 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # GPTQ/AWQ elif hasattr(base_layer, "qweight"): return base_layer.qweight.device - # marlin - elif hasattr(base_layer, "B"): - return base_layer.B.device # HQQ marlin elif hasattr(base_layer, "W_q"): return base_layer.W_q.device diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d3b6b2089f..654e2ec7b2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -42,7 +42,6 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index a4c2671225..ea51468422 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -15,7 +15,6 @@ QuantizationMethods = Literal[ "fbgemm_fp8", "modelopt", "modelopt_fp4", - "marlin", "bitblas", "gguf", "gptq_marlin_24", @@ -25,7 +24,6 @@ QuantizationMethods = Literal[ "gptq", "compressed-tensors", "bitsandbytes", - "qqq", "hqq", "experts_int8", "neuron_quant", @@ -106,13 +104,11 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .hqq_marlin import HQQMarlinConfig from .inc import INCConfig from .ipex_quant import IPEXConfig - from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config - from .qqq import QQQConfig from .rtn import RTNConfig from .torchao import TorchAOConfig from .tpu_int8 import Int8TpuConfig @@ -125,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "fbgemm_fp8": FBGEMMFp8Config, "modelopt": ModelOptFp8Config, "modelopt_fp4": ModelOptNvFp4Config, - "marlin": MarlinConfig, "bitblas": BitBLASConfig, "gguf": GGUFConfig, "gptq_marlin_24": GPTQMarlin24Config, @@ -136,7 +131,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "ptpc_fp8": PTPCFp8Config, - "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py deleted file mode 100644 index 18d1c13373..0000000000 --- a/vllm/model_executor/layers/quantization/marlin.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from 
vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - - -class MarlinConfig(QuantizationConfig): - """Config class for Marlin. - - Reference: https://github.com/IST-DASLab/marlin/tree/master - """ - - def __init__( - self, - group_size: int, - lm_head_quantized: bool, - ) -> None: - super().__init__() - - # Group size for the quantization. - self.group_size = group_size - self.lm_head_quantized = lm_head_quantized - if self.group_size != 128 and self.group_size != -1: - raise ValueError( - "Currently, only group size 128 and -1 (channelwise) " - "is supported for Marlin, but got group_size of " - f"{self.group_size}") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // 4 - - # Tile size used by marlin kernels. - self.tile_size = 16 - - # Min out_features dim - self.min_n_threads = 64 - - # Min in_features dim - self.min_k_threads = 128 - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = 16 - - # Permutation length used by the marlin kernels. - self.perm_len = 1024 - - def __repr__(self) -> str: - return (f"MarlinConfig(group_size={self.group_size}, " - f"lm_head_quantized={self.lm_head_quantized})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "marlin" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - # Need to figure it out - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": - group_size = cls.get_from_keys(config, ["group_size"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], - default=False) - return cls(group_size, lm_head_quantized) - - @classmethod - def override_quantization_method( - cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: - # compat: autogptq >=0.8.0 use checkpoint_format: str - # compat: autogptq <=0.7.1 is_marlin_format: bool - is_marlin_format = (hf_quant_cfg.get("checkpoint_format") == "marlin" - or hf_quant_cfg.get("is_marlin_format", False)) - - is_valid_user_quant = (user_quant is None or user_quant == "gptq" - or user_quant == "marlin") - - if is_marlin_format and is_valid_user_quant: - msg = ("The model is serialized in {} format. Using {} kernel.". - format(cls.get_name(), cls.get_name())) - logger.info(msg) - return cls.get_name() - - return None - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["MarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return MarlinLinearMethod(self) - return None - - -class MarlinLinearMethod(LinearMethodBase): - """Linear method for Marlin. - - Args: - quant_config: The Marlin quantization config. - """ - - def __init__(self, quant_config: MarlinConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del output_size # Unused. 
- weight_loader = extra_weight_attrs["weight_loader"] - - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - # Determine if channelwise or not - input_groups = (1 if self.quant_config.group_size == -1 else - input_size_per_partition // - self.quant_config.group_size) - - weight_scale_args = { - "data": - torch.empty( - input_groups, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - "weight_loader": - weight_loader - } - if input_groups == 1: - scales = ChannelQuantScaleParameter(output_dim=1, - **weight_scale_args) - else: - scales = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **weight_scale_args) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s", scales) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s = Parameter(layer.s.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - scales = layer.s - workspace = layer.workspace 
- - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = scales.shape[1] - - output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, - size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py deleted file mode 100644 index 25978cb13b..0000000000 --- a/vllm/model_executor/layers/quantization/qqq.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - -MARLIN_QQQ_TILE = 16 -MARLIN_QQQ_MIN_THREAD_N = 64 -MARLIN_QQQ_MIN_THREAD_K = 128 -MARLIN_QQQ_MAX_PARALLEL = 16 - -MARLIN_QQQ_SUPPORTED_NUM_BITS = [4] -MARLIN_QQQ_SUPPORTED_GROUP_SIZES = [-1, 128] -MARLIN_QQQ_SUPPORTED_SYM = [True] - - -class QQQConfig(QuantizationConfig): - """Config class for QQQ - - Reference: https://arxiv.org/pdf/2406.09904 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - is_sym: bool = True, - ) -> None: - super().__init__() - self.weight_bits = weight_bits - self.group_size = group_size - self.is_sym = is_sym - - # Verify - if self.weight_bits not in MARLIN_QQQ_SUPPORTED_NUM_BITS: - raise ValueError( - f"QQQ does not support weight_bits = {self.weight_bits}. " - f"Only weight_bits = {MARLIN_QQQ_SUPPORTED_NUM_BITS} " - "are supported.") - if self.group_size not in MARLIN_QQQ_SUPPORTED_GROUP_SIZES: - raise ValueError( - f"QQQ does not support group_size = {self.group_size}. " - f"Only group_sizes = {MARLIN_QQQ_SUPPORTED_GROUP_SIZES} " - "are supported.") - if self.is_sym not in MARLIN_QQQ_SUPPORTED_SYM: - raise ValueError( - f"QQQ does not support is_sym = {self.is_sym}. " - f"Only sym = {MARLIN_QQQ_SUPPORTED_SYM} are supported.") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // self.weight_bits - - # Tile size used by QQQ kernels. - self.tile_size = MARLIN_QQQ_TILE - - # Min out_features dim - self.min_n_threads = MARLIN_QQQ_MIN_THREAD_N - - # Min in_features dim - self.min_k_threads = MARLIN_QQQ_MIN_THREAD_K - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = MARLIN_QQQ_MAX_PARALLEL - - # Permutation length used by the QQQ kernels. 
- self.perm_len = 1024 - - def __repr__(self) -> str: - return "QQQConfig(weight_bits={}, group_size={})".format( - self.weight_bits, self.group_size) - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "qqq" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "QQQConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - return cls(weight_bits, group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["QQQLinearMethod"]: - if isinstance(layer, LinearBase): - return QQQLinearMethod(self) - return None - - -class QQQLinearMethod(LinearMethodBase): - """Linear method for QQQ. - - Args: - quant_config: The QQQ quantization config. - """ - - def __init__(self, quant_config: QQQConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs["weight_loader"] - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. 
- qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - s_channel = ChannelQuantScaleParameter(data=torch.empty( - 1, - output_size_per_partition, - device="cuda", - dtype=torch.float, - ), - weight_loader=weight_loader, - output_dim=1) - - if self.quant_config.group_size == -1: - s_group_data = torch.tensor( - [], - device="cuda", - dtype=torch.half, - ) - else: - s_group_data = torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition, - device="cuda", - dtype=torch.half, - ) - - s_group_attr = {"data": s_group_data, "weight_loader": weight_loader} - - if self.quant_config.group_size == -1: - s_group = BasevLLMParameter(**s_group_attr) - else: - s_group = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **s_group_attr) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s_channel", s_channel) - layer.register_parameter("s_group", s_group) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s_channel = Parameter(layer.s_channel.data, requires_grad=False) - layer.s_group = Parameter(layer.s_group.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - s_ch = layer.s_channel - s_group = layer.s_group - workspace = layer.workspace - - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = s_ch.shape[1] - - x_int8, s_tok, _ = ops.scaled_int8_quant(x_2d) - - output_2d = ops.marlin_qqq_gemm(x_int8, qweight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py deleted file mode 100644 index 8a64bebae0..0000000000 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import numpy -import torch - -from .marlin_utils_test import marlin_permute_weights -from .quant_utils import get_pack_factor, qqq_quantize_weights - - -def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): - # Permute - q_w = marlin_permute_weights(q_w, size_k, size_n, perm) - - # Pack - pack_factor = get_pack_factor(num_bits) - orig_device = q_w.device - - q_w = q_w.cpu().numpy().astype(numpy.uint32) 
- - q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), - dtype=numpy.uint32) - if group_size == size_k: - for i in range(pack_factor): - q_packed |= (q_w[:, i::pack_factor] & 0xF) << num_bits * i - else: - for i in range(pack_factor): - q_packed |= q_w[:, i::pack_factor] << num_bits * i - - q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device) - - return q_packed - - -def get_qqq_scale_perms(): - scale_perm: list[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - -# NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 -def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: list[int] = [] - for i in range(32): - perm1: list[int] = [] - col = i // 4 - for block in [0, 1]: - for row in [ - 4 * (i % 4), - 4 * (i % 4) + 1, - 4 * (i % 4) + 2, - 4 * (i % 4) + 3, - ]: - perm1.append(16 * row + col + 8 * block) - for j in range(4): - perm_list.extend([p + 256 * j for p in perm1]) - - perm = numpy.array(perm_list) - - assert quant_type in ["per-channel", - "per-group"], "not supported quantization type" - if num_bits == 4: - if quant_type == "per-channel": - interleave = numpy.array([4, 0, 5, 1, 6, 2, 7, 3]) - else: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - else: - raise Exception("num_bits must be 4, got {}".format(num_bits)) - - perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() - perm = torch.from_numpy(perm) - return perm - - -def marlin_qqq_permute_scales(s_group, s_channel, size_k, size_n, group_size): - scale_perm, scale_perm_single = get_qqq_scale_perms() - if group_size < size_k and group_size != -1: - s_group = s_group.reshape((-1, len(scale_perm)))[:, scale_perm] - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_group = s_group.reshape((-1, size_n)).contiguous() - else: - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_channel = s_channel.reshape((-1, size_n)).contiguous() - - return s_group, s_channel - - -def marlin_qqq_quantize( - w: torch.Tensor, - num_bits: int, - group_size: int, -): - size_k, size_n = w.shape - - # Normalize group_size - if group_size == -1: - group_size = size_k - assert group_size <= size_k - quant_type = "per-channel" if group_size == size_k else "per-group" - - # Quantize - w_ref, q_w, s_group, s_channel = qqq_quantize_weights( - w, num_bits, group_size) - - # Reformat to marlin_qqq - weight_perm = get_qqq_weight_perm(num_bits, quant_type) - marlin_qqq_q_w = marlin_qqq_weights(q_w, size_k, size_n, num_bits, - weight_perm, group_size) - marlin_qqq_s_group, marlin_qqq_s_channel = marlin_qqq_permute_scales( - s_group, s_channel, size_k, size_n, group_size) - - # Create result - res_list = [ - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel - ] - for i in range(len(res_list)): - res_list[i] = res_list[i].to(w.device) - - return res_list diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 428e9e99aa..3cfaca6230 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -9,8 +9,6 @@ import numpy import torch from vllm._custom_ops import 
cutlass_scaled_mm_supports_fp4 -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -386,89 +384,6 @@ def gptq_quantize_weights(w: torch.Tensor, return w_ref, w_q, w_s, g_idx, rand_perm -# QQQ employs different quant schemes for per-group and -# per-channel quantization. -def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int): - orig_device = w.device - size_k, size_n = w.shape - - assert w.is_floating_point(), "w must be float" - assert num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS, \ - f"Unsupported num_bits = {num_bits}" - assert group_size in SUPPORTED_GROUP_SIZES + [ - size_k - ], f"Unsupported groupsize = {group_size}" - - if group_size == -1: - group_size = size_k - assert group_size <= size_k - - if group_size < size_k: - # Reshape to [groupsize, -1] - w = w.reshape((-1, group_size, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((group_size, -1)) - - max_q_val = 2**num_bits - 1 - half_q_val = (max_q_val + 1) // 2 - - # Compute scale for each group - s_group = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_group *= 2 / max_q_val # 2 => symmetric - - # Quantize - q_w = torch.round(w / s_group).int() - q_w += half_q_val - q_w = torch.clamp(q_w, 0, max_q_val) - # Compute ref (dequantized) - w_ref = (q_w - half_q_val).half() * s_group - - # Restore original shapes - def reshape_w(w): - w = w.reshape((group_size, -1, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((size_k, size_n)).contiguous() - return w - - q_w = reshape_w(q_w) - w_ref = reshape_w(w_ref) - - # Compute int8 quantization scale for each channel - s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0] - s_channel /= 127.0 - t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8) - w_ref = t_int8.half() * s_channel - s_channel = s_channel.reshape(1, -1).to(dtype=torch.float) - - # Fuse scales - s_group = (s_group.reshape(-1, size_n).contiguous() / - s_channel).to(dtype=torch.half) - else: - max_q_val = 2**(num_bits - 1) - 1 - - # Compute scale for each channel - s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_channel /= max_q_val - - # Quantize - q_w = torch.round(w / s_channel).int() - q_w = torch.clamp(q_w, -max_q_val, max_q_val) - # Compute ref (dequantized) - w_ref = q_w.half() * s_channel - - s_group = torch.tensor([], dtype=torch.half) - # div 2 ** (8 - self.bits)) to offset right shift in unpacking - s_channel /= (2**(8 - num_bits)) - s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float) - - return ( - w_ref.to(device=orig_device), - q_w.to(device=orig_device), - s_group.to(device=orig_device), - s_channel.to(device=orig_device), - ) - - def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor): orig_device = q_w.device From 582bbe6bd708d01d74d6d02d6ef59b4c3c34a7b1 Mon Sep 17 00:00:00 2001 From: bigmoyan Date: Thu, 21 Aug 2025 03:59:54 +0800 Subject: [PATCH 440/932] [Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259) Co-authored-by: wangzhengtao --- .../test_completion_with_function_calling.py | 314 +++++++++++------- tests/utils.py | 10 +- vllm/entrypoints/chat_utils.py | 17 +- vllm/entrypoints/openai/protocol.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 64 +++- .../tool_parsers/deepseekv3_tool_parser.py | 4 +- .../granite_20b_fc_tool_parser.py | 4 +- .../tool_parsers/granite_tool_parser.py | 4 +- .../openai/tool_parsers/hermes_tool_parser.py | 4 +- 
.../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 4 +- .../openai/tool_parsers/llama_tool_parser.py | 4 +- .../tool_parsers/minimax_tool_parser.py | 4 +- .../tool_parsers/phi4mini_tool_parser.py | 4 +- .../openai/tool_parsers/xlam_tool_parser.py | 4 +- 15 files changed, 283 insertions(+), 166 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f861..4ef5d4e8a6 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -13,6 +13,127 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen3-0.6B" +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": + "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" 
+ }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, +] + @pytest.fixture(scope="module") def server(): # noqa: F811 @@ -27,6 +148,8 @@ def server(): # noqa: F811 "hermes", "--reasoning-parser", "qwen3", + "--gpu-memory-utilization", + "0.4" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -54,129 +177,6 @@ async def client(server): async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: Union[str, dict], enable_thinking: bool): - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - "options": { - "$ref": "#/$defs/WeatherOptions", - "description": - "Optional parameters for weather query", - }, - }, - "required": ["country", "unit"], - "$defs": { - "WeatherOptions": { - "title": "WeatherOptions", - "type": "object", - "additionalProperties": False, - "properties": { - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "default": "celsius", - "description": "Temperature unit", - "title": "Temperature Unit", - }, - "include_forecast": { - "type": "boolean", - "default": False, - "description": - "Whether to include a 24-hour forecast", - "title": "Include Forecast", - }, - "language": { - "type": "string", - "default": "zh-CN", - "description": "Language of the response", - "title": "Language", - "enum": ["zh-CN", "en-US", "ja-JP"], - }, - }, - }, - }, - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_forecast", - "description": "Get the weather forecast for a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "days", "unit"], - }, - }, - }, - ] - - messages = [ - { - "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ - "forecast for the next 5 days, in fahrenheit?", - }, - ] if not stream: # Non-streaming test chat_completion = await client.chat.completions.create( @@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) assert len(output) > 0 + + +@pytest.fixture(scope="module") +def k2_server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "half", + "--enable-auto-tool-choice", + "--guided-decoding-backend", + "xgrammar", + "--tool-call-parser", + "hermes", + "--reasoning-parser", + "qwen3", + "--gpu-memory-utilization", + "0.4", + ] + # hack to test kimi_k2 tool use tool_id format. + # avoid error in is_deepseek_mla check by setting kv_lora_rank=null + with RemoteOpenAIServer(MODEL_NAME, + args, + override_hf_configs={ + "model_type": 'kimi_k2', + 'kv_lora_rank': None + }) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def k2_client(k2_server): + async with k2_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("tool_choice", ["required"]) +async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, + stream: bool, tool_choice: str): + + if not stream: + # Non-streaming test + chat_completion = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice) + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + assert chat_completion.choices[0].message.tool_calls[ + 0].id == 'functions.get_current_weather:0' + else: + # Streaming test + output_stream = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice, + stream=True) + + output = [] + async for chunk in output_stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + for o in output: + assert o.id is None or o.id == 'functions.get_current_weather:0' diff --git a/tests/utils.py b/tests/utils.py index e98707fb44..4dba549466 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ import asyncio import copy import functools import importlib +import json import os import signal import subprocess @@ -101,7 +102,8 @@ class RemoteOpenAIServer: env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = 0, auto_port: bool = True, - max_wait_seconds: Optional[float] = None) -> None: + max_wait_seconds: Optional[float] = None, + override_hf_configs: Optional[dict[str, Any]] = None) -> None: if auto_port: if "-p" in vllm_serve_args or "--port" in vllm_serve_args: raise ValueError("You have manually specified the port " @@ -120,6 +122,12 @@ class RemoteOpenAIServer: vllm_serve_args = vllm_serve_args + ["--seed", str(seed)] + if override_hf_configs is not None: + vllm_serve_args = vllm_serve_args + [ + "--hf-overrides", + json.dumps(override_hf_configs) + ] + parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") subparsers = parser.add_subparsers(required=False, dest="subparser") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 74c8093f49..87772a499f 
100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1345,5 +1345,18 @@ def apply_mistral_chat_template( "template") raise ValueError(str(e)) from e -def random_tool_call_id() -> str: - return f"chatcmpl-tool-{random_uuid()}" +def get_history_tool_calls_cnt(conversation: list[ConversationMessage]): + idx = 0 + for msg in conversation: + if msg['role'] == 'assistant': + tool_calls = msg.get('tool_calls') + idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa + return idx + +def make_tool_call_id(id_type:str='random', func_name=None, idx=None): + + if id_type=='kimi_k2': + return f'functions.{func_name}:{idx}' + else: + # by default return random + return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 39facd4d53..a44868973f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,7 +38,7 @@ from typing_extensions import TypeAlias from vllm import envs from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - random_tool_call_id) + make_tool_call_id) from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam) from vllm.logger import init_logger @@ -1634,7 +1634,7 @@ class FunctionCall(OpenAIBaseModel): class ToolCall(OpenAIBaseModel): - id: str = Field(default_factory=random_tool_call_id) + id: str = Field(default_factory=make_tool_call_id) type: Literal["function"] = "function" function: FunctionCall diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d57868847e..65aac23ee6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, - random_tool_call_id) + get_history_tool_calls_cnt, + make_tool_call_id) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, parse_chat_input, @@ -133,6 +134,10 @@ class OpenAIServingChat(OpenAIServing): source = "model" if source == "auto" else source logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + if self.model_config.hf_config.model_type == 'kimi_k2': + self.tool_call_id_type = 'kimi_k2' + else: + self.tool_call_id_type = 'random' self.use_harmony = model_config.hf_config.model_type == "gpt_oss" if self.use_harmony: @@ -379,6 +384,7 @@ class OpenAIServingChat(OpenAIServing): current_text: Optional[str], delta_text: str, function_name_returned: bool, + tool_call_idx: Optional[int] = None ) -> tuple[Optional[DeltaMessage], bool]: if current_text is None or current_text == "": # if the current text is empty, we cannot parse it @@ -424,8 +430,12 @@ class OpenAIServingChat(OpenAIServing): current_tool_call = obj[-2] function_name_returned = True + tool_call_id = make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=current_tool_call["name"], + idx=tool_call_idx) delta_message = DeltaMessage(tool_calls=[ - DeltaToolCall(id=random_tool_call_id(), + DeltaToolCall(id=tool_call_id, function=DeltaFunctionCall( name=current_tool_call["name"], arguments=arguments), @@ -491,6 +501,10 @@ class OpenAIServingChat(OpenAIServing): all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * 
num_choices + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 # Always track previous_texts for comprehensive output logging previous_texts = [""] * num_choices @@ -673,7 +687,6 @@ class OpenAIServingChat(OpenAIServing): previous_text = previous_texts[i] previous_token_ids = all_previous_token_ids[i] current_text = previous_text + delta_text - # avoid the None + list error. if previous_token_ids: current_token_ids = previous_token_ids + as_list( @@ -733,7 +746,7 @@ class OpenAIServingChat(OpenAIServing): index=i) else: delta_tool_call = DeltaToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=DeltaFunctionCall( name=tool_choice_function_name, @@ -764,7 +777,11 @@ class OpenAIServingChat(OpenAIServing): previous_text=previous_text, current_text=content, delta_text=delta_text, - function_name_returned=fn_name_returned)) + function_name_returned=fn_name_returned, + tool_call_idx=history_tool_call_cnt)) + if (delta_message and delta_message.tool_calls and + delta_message.tool_calls[0].id is not None): + history_tool_call_cnt += 1 # update the previous values for the next iteration previous_texts[i] = current_text @@ -1089,6 +1106,10 @@ class OpenAIServingChat(OpenAIServing): assert final_res is not None choices: list[ChatCompletionResponseChoice] = [] + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -1194,17 +1215,26 @@ class OpenAIServingChat(OpenAIServing): assert content is not None tool_calls = TypeAdapter( list[FunctionDefinition]).validate_json(content) + tool_call_ids = [] + for tool_call in tool_calls: + tool_call_ids.append( + make_tool_call_id(id_type=self.tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt)) + history_tool_call_cnt += 1 message = ChatMessage( role=role, content="", - reasoning_content=reasoning_content, tool_calls=[ - tool_call_class(function=FunctionCall( - name=tool_call.name, - arguments=json.dumps(tool_call.parameters, - ensure_ascii=False))) - for tool_call in tool_calls - ]) + tool_call_class(id=tool_call_ids[i], + function=FunctionCall( + name=tool_call.name, + arguments=json.dumps( + tool_call.parameters, + ensure_ascii=False))) + for i, tool_call in enumerate(tool_calls) + ], + reasoning_content=reasoning_content) # if the request doesn't use tool choice # OR specifies to not use a tool @@ -1248,7 +1278,6 @@ class OpenAIServingChat(OpenAIServing): if (tool_call_info.content and len(tool_call_info.content) > 0): ret_content = tool_call_info.content - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=ret_content) @@ -1327,12 +1356,11 @@ class OpenAIServingChat(OpenAIServing): elif choice.message.tool_calls: # For tool calls, log the function name and arguments tool_call_descriptions = [] - for tool_call in choice.message.tool_calls: - if hasattr(tool_call.function, "name") and hasattr( - tool_call.function, "arguments"): + for tc in choice.message.tool_calls: + if hasattr(tc.function, "name") and hasattr( + tc.function, "arguments"): tool_call_descriptions.append( - f"{tool_call.function.name}({tool_call.function.arguments})" - ) + f"{tc.function.name}({tc.function.arguments})") tool_calls_str = ", ".join(tool_call_descriptions) output_text = f"[tool_calls: {tool_calls_str}]" diff --git 
a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index da4760ad1b..ac272b0c3b 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -6,7 +6,7 @@ from typing import Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser): DeltaToolCall( index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True), diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 5508ba6a39..824b100f35 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -10,7 +10,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -203,7 +203,7 @@ class Granite20bFCToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index fcc5b7edda..ac517616a9 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -185,7 +185,7 @@ class GraniteToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index d126130ab9..a6ce33af6b 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -307,7 +307,7 @@ class Hermes2ProToolParser(ToolParser): return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, 
type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 92004de030..6ef8fadf59 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -107,7 +107,7 @@ class Internlm2ToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 66b483d8b0..3b41f60347 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -222,7 +222,7 @@ class JambaToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 194a144ad5..31b19c8db4 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -10,7 +10,7 @@ import regex as re from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -213,7 +213,7 @@ class Llama3JsonToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 226309ef29..283e609501 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 
DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -394,7 +394,7 @@ class MinimaxToolParser(ToolParser): sent_tools.append({ "sent_name": False, "sent_arguments": "", - "id": random_tool_call_id(), + "id": make_tool_call_id(), }) while len(tool_ids) < tool_count: diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 5501028cf3..85dd56213c 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -8,7 +8,7 @@ from typing import Any, Optional import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, ExtractedToolCallInformation, @@ -74,7 +74,7 @@ class Phi4MiniJsonToolParser(ToolParser): tool_calls: list[ToolCall] = [ ToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=FunctionCall( name=raw_function_call["name"], diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 321718b1c9..87cd413b37 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -226,7 +226,7 @@ class xLAMToolParser(ToolParser): function_name = name_match.group(1) # The test expects us to send just the name first - tool_id = random_tool_call_id() + tool_id = make_tool_call_id() delta = DeltaMessage(tool_calls=[ DeltaToolCall( index=0, From b95697d7310637399998ebf1f21a26b523aa6611 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 20 Aug 2025 13:03:37 -0700 Subject: [PATCH 441/932] [Frontend] improve error logging of chat completion (#22957) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 74 +++++++++++++++++++++------ 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 24148bcef2..14ba8aa641 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -600,8 +600,11 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Responses API") - - generator = await handler.create_responses(request, raw_request) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -618,7 +621,11 @@ async def retrieve_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.retrieve_responses(response_id) + try: + response = await handler.retrieve_responses(response_id) + except Exception as e: + raise 
HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -633,7 +640,11 @@ async def cancel_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.cancel_responses(response_id) + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -667,9 +678,11 @@ async def create_chat_completion(request: ChatCompletionRequest, if handler is None: return base(raw_request).create_error_response( message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -742,7 +755,11 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Embeddings API") - generator = await handler.create_embedding(request, raw_request) + try: + generator = await handler.create_embedding(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -770,8 +787,11 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Pooling API") - - generator = await handler.create_pooling(request, raw_request) + try: + generator = await handler.create_pooling(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -791,7 +811,11 @@ async def create_classify(request: ClassificationRequest, return base(raw_request).create_error_response( message="The model does not support Classification API") - generator = await handler.create_classify(request, raw_request) + try: + generator = await handler.create_classify(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -820,7 +844,11 @@ async def create_score(request: ScoreRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Score API") - generator = await handler.create_score(request, raw_request) + try: + generator = await handler.create_score(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return 
JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -878,8 +906,12 @@ async def create_transcriptions(raw_request: Request, message="The model does not support Transcriptions API") audio_data = await request.file.read() - generator = await handler.create_transcription(audio_data, request, - raw_request) + try: + generator = await handler.create_transcription(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -919,8 +951,12 @@ async def create_translations(request: Annotated[TranslationRequest, message="The model does not support Translations API") audio_data = await request.file.read() - generator = await handler.create_translation(audio_data, request, - raw_request) + try: + generator = await handler.create_translation(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -949,7 +985,11 @@ async def do_rerank(request: RerankRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Rerank (Score) API") - generator = await handler.do_rerank(request, raw_request) + try: + generator = await handler.do_rerank(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) From bf7c99dfc40bff6844b2ae57554516922eb93b71 Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Wed, 20 Aug 2025 13:17:11 -0700 Subject: [PATCH 442/932] [Perf] Speed up function `_convert_tokens_to_string_with_added_encoders` by 13.7x (#20413) Signed-off-by: Saurabh Misra Signed-off-by: Aseem Saxena Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> Co-authored-by: Aseem Saxena --- vllm/transformers_utils/detokenizer_utils.py | 25 ++++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index be1040c3e0..101f31d39c 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -23,27 +23,32 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. 
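(Editor's aside, not part of this diff: the hunk below works by hoisting per-token lookups out of the loop. A minimal, self-contained sketch of that pattern follows, assuming only the standard library; DummyTokenizer and the token list are invented stand-ins for the real tokenizer objects.)

import timeit

class DummyTokenizer:
    """Invented stand-in exposing the two methods the hot loop touches."""

    def __init__(self):
        self._added = {f"<extra_{i}>": i for i in range(1000)}

    def get_added_vocab(self):
        # Returns a fresh dict on each call, as HF tokenizers do.
        return dict(self._added)

    def convert_tokens_to_string(self, tokens):
        return " ".join(tokens)

tok = DummyTokenizer()
output_tokens = ["hello", "<extra_3>", "world"] * 2000

def per_iteration_lookups():
    pieces = []
    for token in output_tokens:
        if token in tok.get_added_vocab():  # dict rebuilt on every iteration
            pieces.append(token)
        else:
            pieces.append(tok.convert_tokens_to_string([token]))
    return "".join(pieces)

def hoisted_lookups():
    added_vocab_set = set(tok.get_added_vocab())  # computed once
    convert = tok.convert_tokens_to_string        # bound once
    pieces = []
    for token in output_tokens:
        if token in added_vocab_set:
            pieces.append(token)
        else:
            pieces.append(convert([token]))
    return "".join(pieces)

assert per_iteration_lookups() == hoisted_lookups()
print("per-iteration:", timeit.timeit(per_iteration_lookups, number=20))
print("hoisted:      ", timeit.timeit(hoisted_lookups, number=20))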
+ # Performance improvements: avoid repeated attribute and function lookups; + # localize frequently used objects; + sub_texts: list[str] = [] current_sub_text: list[str] = [] - all_special_tokens = set(tokenizer.all_special_tokens) + convert_tokens_to_string = tokenizer.convert_tokens_to_string + added_vocab_set = set(tokenizer.get_added_vocab()) + all_special_tokens = set( + tokenizer.all_special_tokens) if skip_special_tokens else () + for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: + # Use precomputed set for skip-special check + if token in all_special_tokens: continue - if token in tokenizer.get_added_vocab(): + if token in added_vocab_set: if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] + sub_texts.append(convert_tokens_to_string(current_sub_text)) + current_sub_text.clear() sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) + sub_texts.append(convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: return " ".join(sub_texts) - else: - return "".join(sub_texts) + return "".join(sub_texts) # 5 is an arbitrary value that should work for all From 4e51fa8cbaba2c6fd516b4615a533b0a94796516 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 16:28:30 -0400 Subject: [PATCH 443/932] Do not use eval() to convert unknown types (#23266) Signed-off-by: Russell Bryant --- .../openai/tool_parsers/qwen3coder_tool_parser.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index cf4d0b231a..2501d6739e 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser): "valid JSON object in tool '%s', will try other " "methods to parse it.", param_value, param_name, func_name) - try: - converted_value = eval(param_value) - return converted_value - except Exception: - logger.warning( - "Parsed value '%s' of parameter '%s' cannot be " - "converted via Python `eval()` in tool '%s', " - "degenerating to string.", param_value, param_name, - func_name) + logger.warning( + "Parameter '%s' has unknown type '%s'. 
" + "The value will be treated as a string.", param_name, + param_type) return param_value # Extract function name From 4fbda0b20cc539f72314375c2abc6100ebac8392 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 21 Aug 2025 05:07:28 +0800 Subject: [PATCH 444/932] [Feature] use --eplb_config to set eplb param (#20562) Signed-off-by: rongfu.leng Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: rongfu.leng Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 3 +- vllm/config/parallel.py | 108 +++++++++++++++++----- vllm/distributed/eplb/eplb_state.py | 4 +- vllm/engine/arg_utils.py | 63 +++++++++---- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/glm4_moe.py | 4 +- vllm/model_executor/models/qwen3_moe.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 4 +- 9 files changed, 149 insertions(+), 52 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 62dfd4333b..959f111ced 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -33,7 +33,8 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) -from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, + ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7a9e68f0ea..2b716a7706 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -6,7 +6,7 @@ from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional, Union import torch -from pydantic import model_validator +from pydantic import TypeAdapter, model_validator from pydantic.dataclasses import dataclass from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self @@ -32,6 +32,38 @@ logger = init_logger(__name__) DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] +@config +@dataclass +class EPLBConfig: + """Configuration for Expert Parallel Load Balancing (EP).""" + + window_size: int = 1000 + """Window size for expert load recording.""" + step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `lb_window_size` steps will be used for rearranging experts. + """ + + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + + log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + @classmethod + def from_cli(cls, cli_value: str) -> "EPLBConfig": + """Parse the CLI value for the compilation config. + -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. 
+ """ + return TypeAdapter(EPLBConfig).validate_json(cli_value) + + @config @dataclass class ParallelConfig: @@ -75,22 +107,24 @@ class ParallelConfig: """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ + eplb_config: EPLBConfig = field(default_factory=EPLBConfig) + """Expert parallelism configuration.""" + num_redundant_experts: Optional[int] = None + """`num_redundant_experts` is deprecated and has been replaced with + `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. + Please use `eplb_config.num_redundant_experts` instead.""" + eplb_window_size: Optional[int] = None + """`eplb_window_size` is deprecated and has been replaced with + `eplb_config.window_size`. This will be removed in v0.12.0. + Please use `eplb_config.window_size` instead.""" + eplb_step_interval: Optional[int] = None + """`eplb_step_interval` is deprecated and has been replaced with + `eplb_config.step_interval`. This will be removed in v0.12.0. + Please use `eplb_config.step_interval` instead.""" + eplb_log_balancedness: Optional[bool] = None + """`eplb_log_balancedness` is deprecated and has been replaced with + `eplb_config.log_balancedness`. This will be removed in v0.12.0. + Please use `eplb_config.log_balancedness` instead.""" max_parallel_loading_workers: Optional[int] = None """Maximum number of parallel loading workers when loading model @@ -237,6 +271,38 @@ class ParallelConfig: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: + # Forward deprecated fields to their new location + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = ( + self.num_redundant_experts) + logger.warning_once( + "num_redundant_experts is deprecated and has been replaced " + "with eplb_config.num_redundant_experts. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + logger.warning_once( + "eplb_window_size is deprecated and has been replaced " + "with eplb_config.window_size. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + logger.warning_once( + "eplb_step_interval is deprecated and has been replaced " + "with eplb_config.step_interval. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + logger.warning_once( + "eplb_log_balancedness is deprecated and has been replaced " + "with eplb_config.log_balancedness. This will be removed " + "in v0.12.0. 
Changing this field after initialization will " + "have no effect.") + + # Continue with the rest of the initialization self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size @@ -275,10 +341,10 @@ class ParallelConfig: raise ValueError( "Expert parallelism load balancing is only supported on " "CUDA devices now.") - if self.num_redundant_experts < 0: + if self.eplb_config.num_redundant_experts < 0: raise ValueError( "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if not self.enable_expert_parallel: raise ValueError( "enable_expert_parallel must be True to use EPLB.") @@ -289,10 +355,10 @@ class ParallelConfig: f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." ) else: - if self.num_redundant_experts != 0: + if self.eplb_config.num_redundant_experts != 0: raise ValueError( "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 979f2a06ce..042acf40d6 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -244,7 +244,7 @@ class EplbState: dtype=torch.int32, device=device, ) - expert_load_window_size = parallel_config.eplb_window_size + expert_load_window_size = parallel_config.eplb_config.window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, model.num_physical_experts), @@ -253,7 +253,7 @@ class EplbState: ) # Set the initial progress of rearrangement to 3/4 - eplb_step_interval = parallel_config.eplb_step_interval + eplb_step_interval = parallel_config.eplb_config.step_interval expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6869c3f23f..dcf7875894 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, + DeviceConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, @@ -305,11 +305,12 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb - num_redundant_experts: int = ParallelConfig.num_redundant_experts - eplb_window_size: int = ParallelConfig.eplb_window_size - eplb_step_interval: int = ParallelConfig.eplb_step_interval - eplb_log_balancedness: bool = ParallelConfig.eplb_log_balancedness + num_redundant_experts: int = EPLBConfig.num_redundant_experts + eplb_window_size: int = EPLBConfig.window_size + eplb_step_interval: int = EPLBConfig.step_interval + eplb_log_balancedness: bool = EPLBConfig.log_balancedness max_parallel_loading_workers: Optional[ int] = 
ParallelConfig.max_parallel_loading_workers block_size: Optional[BlockSize] = CacheConfig.block_size @@ -454,6 +455,9 @@ class EngineArgs: if isinstance(self.compilation_config, dict): self.compilation_config = CompilationConfig( **self.compilation_config) + if isinstance(self.eplb_config, dict): + self.eplb_config = EPLBConfig.from_cli(json.dumps( + self.eplb_config)) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() @@ -661,14 +665,32 @@ class EngineArgs: **parallel_kwargs["enable_expert_parallel"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) - parallel_group.add_argument("--num-redundant-experts", - **parallel_kwargs["num_redundant_experts"]) - parallel_group.add_argument("--eplb-window-size", - **parallel_kwargs["eplb_window_size"]) - parallel_group.add_argument("--eplb-step-interval", - **parallel_kwargs["eplb_step_interval"]) - parallel_group.add_argument("--eplb-log-balancedness", - **parallel_kwargs["eplb_log_balancedness"]) + parallel_group.add_argument("--eplb-config", + **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--num-redundant-experts", + type=int, + help= + "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-window-size", + type=int, + help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-step-interval", + type=int, + help= + "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-log-balancedness", + action=argparse.BooleanOptionalAction, + help= + "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( "--max-parallel-loading-workers", **parallel_kwargs["max_parallel_loading_workers"]) @@ -1244,6 +1266,16 @@ class EngineArgs: "Currently, speculative decoding is not supported with " "async scheduling.") + # Forward the deprecated CLI args to the EPLB config. + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1257,10 +1289,7 @@ class EngineArgs: data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, - num_redundant_experts=self.num_redundant_experts, - eplb_window_size=self.eplb_window_size, - eplb_step_interval=self.eplb_step_interval, - eplb_log_balancedness=self.eplb_log_balancedness, + eplb_config=self.eplb_config, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f199da135e..d56224b4b7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -132,10 +132,10 @@ class DeepseekV2MoE(nn.Module): # Load balancing settings. 
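(Editor's aside, not part of this diff: a minimal sketch of how the consolidated `--eplb-config` option introduced above is meant to be driven, assuming a vLLM build that contains the new EPLBConfig; the concrete field values and the serve command in the comment are illustrative assumptions.)

import json

from vllm.config import EPLBConfig  # re-exported from vllm.config.parallel above

# Roughly equivalent CLI usage (values are illustrative):
#   vllm serve <model> --enable-expert-parallel --enable-eplb \
#       --eplb-config '{"window_size": 1000, "step_interval": 3000,
#                       "num_redundant_experts": 2, "log_balancedness": false}'
cfg = EPLBConfig.from_cli(
    json.dumps({
        "window_size": 1000,
        "step_interval": 3000,
        "num_redundant_experts": 2,
        "log_balancedness": False,
    }))
assert cfg.num_redundant_experts == 2
print(cfg)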
vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index aff491f959..fe5e46a998 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -131,10 +131,10 @@ class Glm4MoE(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 05bbb0d2e8..2812f79a66 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -121,11 +121,11 @@ class Qwen3MoeSparseMoeBlock(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb self.n_logical_experts = self.n_routed_experts - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) self.n_local_physical_experts = self.n_physical_experts // self.ep_size @@ -363,7 +363,8 @@ class Qwen3MoeModel(nn.Module): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config enable_eplb = parallel_config.enable_eplb - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d9770226b1..33747d6917 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1435,7 +1435,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model, is_dummy, is_profile, - log_stats=self.parallel_config.eplb_log_balancedness, + log_stats=self.parallel_config.eplb_config.log_balancedness, ) def get_dp_padding(self, @@ -1977,7 +1977,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): global_expert_load, old_global_expert_indices = ( EplbState.recv_state()) num_logical_experts = global_expert_load.shape[1] - self.parallel_config.num_redundant_experts = ( + self.parallel_config.eplb_config.num_redundant_experts = ( num_local_physical_experts * new_ep_size - num_logical_experts) assert old_global_expert_indices.shape[ 1] % num_local_physical_experts == 0 diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 22e639b97d..d61177d424 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -515,7 +515,7 @@ class 
Worker(WorkerBase): assert self.model_runner.eplb_state is not None new_physical_experts = \ self.model_runner.eplb_state.physical_to_logical_map.shape[1] - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - self.model_runner.eplb_state.logical_replica_count.shape[1]) global_expert_load = None @@ -531,7 +531,7 @@ class Worker(WorkerBase): assert self.model_runner.eplb_state is not None global_expert_load = self.model_runner.eplb_state.rearrange( self.model_runner.model, execute_shuffle=False) - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_load.shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) self.model_runner.model.update_physical_experts_metadata( From 1b125004bea9f4cd120d3ce96dc1d3a2962ebace Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Aug 2025 05:15:34 +0800 Subject: [PATCH 445/932] [misc] fix multiple arch wheels for the nightly index (#23110) Signed-off-by: youkaichao --- .buildkite/generate_index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 7045d88104..6b5a2a9935 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -8,7 +8,8 @@ template = """

<html>
  <body>
    <h1>Links for vLLM</h1>
-    <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+    <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
  </body>
</html>
""" @@ -21,7 +22,20 @@ filename = os.path.basename(args.wheel) with open("index.html", "w") as f: print(f"Generated index.html for {args.wheel}") + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64") + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64") + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") # cloudfront requires escaping the '+' character f.write( - template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) ) From a4fbb32fab3d2f91b3672bf581565378aaa18d6c Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:43:17 -0400 Subject: [PATCH 446/932] Remove chunked_prefill_enabled flag in V1 MLA (#23183) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/mla/common.py | 50 +++++++++++------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f2610671f7..646e4fec83 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.model_config = vllm_config.model_config cache_config = vllm_config.cache_config parallel_config = vllm_config.parallel_config - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled self.num_heads = self.model_config.get_num_attention_heads( parallel_config) self.mla_dims = get_mla_dims(self.model_config) @@ -426,30 +425,28 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size - if self.chunked_prefill_enabled: - self.chunked_prefill_workspace_size = min( - # Max sure there is enough for 8 full length request or at least - # 4 pages of cache per request - max( - 8 * self.model_config.max_model_len, 4 * - scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) - assert self.chunked_prefill_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size - self.chunked_prefill_workspace = torch.empty( - (self.chunked_prefill_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=device, - ) + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * self.model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * cache_config.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + 
scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) self._use_cudnn_prefill = use_cudnn_prefill() self._use_fi_prefill = use_flashinfer_prefill() @@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): reqs_start:] - query_start_loc[reqs_start] chunked_context_metadata = None - if self.chunked_prefill_enabled and num_prefills > 0 \ - and max_context_len_cpu > 0: + if max_context_len_cpu > 0: # NOTE: it is recommend you read the `Chunked Prefill` section # in the comment at the top of the file before trying to # understand the following code From 10cc12ba66834e33659f1ce3a00235506db20dd5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:46:47 -0400 Subject: [PATCH 447/932] Feature/mla tests (#23195) Signed-off-by: Matthew Bonanni Signed-off-by: Matthew Bonanni --- tests/v1/attention/test_attention_backends.py | 26 +- tests/v1/attention/test_mla_backends.py | 522 ++++++++++++++++++ tests/v1/attention/utils.py | 11 +- vllm/v1/attention/backends/mla/common.py | 16 +- 4 files changed, 551 insertions(+), 24 deletions(-) create mode 100644 tests/v1/attention/test_mla_backends.py diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index ac08b9052c..60e04ad906 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -150,15 +150,15 @@ def create_and_prepopulate_kv_cache( # Permute the context blocks (excluding block 0 which is null) if randomize_blocks: - perm = torch.randperm( - blocks_end - 1) + 1 # Random permutation starting from block 1 + # Random permutation starting from block 1 + perm = torch.randperm(blocks_end - 1) + 1 else: - perm = torch.arange( - 1, blocks_end) # Sequential order starting from block 1 + # Sequential order starting from block 1 + perm = torch.arange(1, blocks_end) inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) - inv_perm[1:] = torch.argsort( - perm) + 1 # Add 1 to account for starting from block 1 + # Add 1 to account for starting from block 1 + inv_perm[1:] = torch.argsort(perm) + 1 kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...] # Construct the right block table @@ -281,7 +281,8 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, @pytest.mark.parametrize("batch_spec_name", [ "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium" + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" ]) @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) def test_backend_correctness(batch_spec_name: str, model: str): @@ -302,7 +303,8 @@ def test_backend_correctness(batch_spec_name: str, model: str): """ batch_spec = BATCH_SPECS[batch_spec_name] vllm_config = create_vllm_config(model_name=model, - max_model_len=max(batch_spec.seq_lens)) + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=8192) device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -465,12 +467,6 @@ def test_backend_correctness(batch_spec_name: str, model: str): rtol=rtol, atol=atol) - if not all_close: - print(f"[{backend_name}] output differs from SDPA baseline. 
" - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") - print(f"[{backend_name}] output: {backend_output}") - print(f"[{backend_name}] SDPA baseline: {sdpa_output}") - assert all_close, ( f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py new file mode 100644 index 0000000000..2407035879 --- /dev/null +++ b/tests/v1/attention/test_mla_backends.py @@ -0,0 +1,522 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for v1 MLA backends without GPUModelRunner dependency.""" + +import pytest +import torch + +from tests.v1.attention.utils import (BatchSpec, _Backend, + create_common_attn_metadata, + create_standard_kv_cache_spec, + create_vllm_config, + get_attention_backend) +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.kv_cache_interface import FullAttentionSpec + +BACKENDS_TO_TEST = [ + _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, + _Backend.TRITON_MLA_VLLM_V1 +] + +# Remove CUTLASS_MLA from the list if not using sm100 +if not torch.cuda.is_available() or torch.cuda.get_device_properties( + 0).major < 10: + BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) + +torch.manual_seed(42) + + +def _convert_dtype_to_torch(dtype): + """Convert ModelDType to torch.dtype.""" + if isinstance(dtype, str): + if dtype == "auto": + return torch.float16 # Default dtype for testing + elif dtype in STR_DTYPE_TO_TORCH_DTYPE: + return STR_DTYPE_TO_TORCH_DTYPE[dtype] + else: + raise ValueError(f"Unknown dtype: {dtype}") + elif isinstance(dtype, torch.dtype): + return dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + +# Define common batch configurations +BATCH_SPECS = { + "small_decode": + BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]), + "small_prefill": + BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]), + "mixed_small": + BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]), + "medium_decode": + BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024], + query_lens=[1, 1, 1, 1, 1, 1, 1, 1]), + "medium_prefill": + BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]), + "mixed_medium": + BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048], + query_lens=[1, 1, 1, 7, 7, 7]), + "large_decode": + BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32), + "large_prefill": + BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), + "single_decode": + BatchSpec(seq_lens=[1024], query_lens=[1]), + "single_prefill": + BatchSpec(seq_lens=[1024], query_lens=[64]), +} + + +def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, + device: torch.device, + num_blocks: int = 100) -> torch.Tensor: + """Create a dummy KV cache tensor for testing.""" + kv_cache = torch.randn( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.head_size, # latent dimension + dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), + device=device, + ) + return kv_cache + + +def create_and_prepopulate_kv_cache( + kv_c_contexts: list[torch.Tensor], + k_pe_contexts: list[torch.Tensor], + block_size: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + num_blocks: int, + common_attn_metadata: CommonAttentionMetadata, + randomize_blocks: bool = True) -> 
torch.Tensor: + """Create and prepopulate an MLA KV cache with context data. + + Args: + kv_c_contexts: List of latent KV context tensors for each sequence + k_pe_contexts: List of key positional embedding context tensors + for each sequence + block_size: Size of each block + num_kv_heads: Number of KV heads (should be 1 for MLA) + head_size: Size of each head (latent dimension) + dtype: Data type for the cache + device: Device to create the cache on + num_blocks: Total number of blocks in the cache + common_attn_metadata: Common attention metadata + randomize_blocks: Whether to randomly permute blocks + or use sequential order + + Returns: + MLA KV cache tensor + """ + batch_size = len(kv_c_contexts) + seq_lens = common_attn_metadata.seq_lens_cpu + query_lens = common_attn_metadata.query_start_loc_cpu[ + 1:] - common_attn_metadata.query_start_loc_cpu[:-1] + context_lens = common_attn_metadata.num_computed_tokens_cpu + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + # Create MLA KV cache: (num_blocks, block_size, head_size) + kv_cache = torch.empty(num_blocks, + block_size, + head_size, + dtype=dtype, + device=device) + kv_cache_flat = kv_cache.view(-1, head_size) + + # Populate the cache with the context tokens + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i] + kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1) + start = start_block_idx * block_size + end = start + kv_context.shape[0] + kv_cache_flat[start:end, ...] = kv_context + + # Stay block aligned and allocate enough blocks for the new tokens + start_block_idx += cdiv(int(seq_lens[i]), block_size) + + blocks_end = start_block_idx + + # Permute the context blocks (excluding block 0 which is null) + if randomize_blocks: + perm = torch.randperm( + blocks_end - 1) + 1 # Random permutation starting from block 1 + else: + perm = torch.arange( + 1, blocks_end) # Sequential order starting from block 1 + + inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) + inv_perm[1:] = torch.argsort( + perm) + 1 # Add 1 to account for starting from block 1 + kv_cache[1:blocks_end, ...] = kv_cache[perm, ...] 
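(Editor's aside, not part of the new test file: the argsort-based inverse permutation used above can be checked in isolation. A minimal sketch, assuming only torch; the tensor sizes are arbitrary.)

import torch

num_blocks = 8                                      # block 0 is the null block
old = torch.arange(num_blocks).float().unsqueeze(1) * 10  # fake per-block data
new = old.clone()

perm = torch.randperm(num_blocks - 1) + 1           # shuffle blocks 1..N-1
inv_perm = torch.zeros(num_blocks, dtype=torch.long)
inv_perm[1:] = torch.argsort(perm) + 1              # logical block id -> new slot

new[1:] = old[perm]                                 # apply the shuffle

# Every logical block id b can still be found at slot inv_perm[b],
# which is exactly what the block table above relies on.
for b in range(1, num_blocks):
    assert torch.equal(new[inv_perm[b]], old[b])
print("inverse permutation maps logical block ids to shuffled slots")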
+ + # Construct the right block table + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size) + start = start_block_idx + end = start + num_blocks_for_seq + block_table[i, :num_blocks_for_seq] = inv_perm[start:end] + start_block_idx += num_blocks_for_seq + + # Create a realistic slot mapping that corresponds to the block table + for i in range(batch_size): + token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i]) + block_indices = token_offsets // block_size + token_inter_block_offsets = token_offsets % block_size + start = common_attn_metadata.query_start_loc_cpu[i] + end = common_attn_metadata.query_start_loc_cpu[i + 1] + slot_mapping[start:end] = block_table[ + i, + block_indices] * block_size + token_inter_block_offsets.to(device) + + return kv_cache + + +class MockAttentionLayer: + """A mock attention layer for testing.""" + + def __init__(self, device: torch.device): + self._q_scale = torch.tensor(1.0, device=device) + self._k_scale = torch.tensor(1.0, device=device) + self._v_scale = torch.tensor(1.0, device=device) + + +def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, + layer_names: list[str], vllm_config, + device: torch.device, + common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, kv_c: torch.Tensor, + k_pe: torch.Tensor, kv_cache: torch.Tensor, + kv_lora_rank: int, qk_nope_head_dim: int, + qk_rope_head_dim: int, v_head_dim: int, + mock_kv_b_proj) -> torch.Tensor: + """Run attention computation using the specified backend's AttentionImpl.""" + + builder_cls, impl_cls = get_attention_backend(backend) + + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) + + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) + + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty(num_tokens, + num_heads * v_head_dim, + dtype=query.dtype, + device=query.device) + + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. 
+ output = impl.forward(mock_layer, + query, + kv_c, + k_pe, + kv_cache, + attn_metadata, + output=output) + + return output + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) +def test_backend_correctness(dist_init, batch_spec_name: str, model: str): + """ + Test that all backends produce similar outputs to a reference implementation + using torch.nn.functional.scaled_dot_product_attention. + + This test works by: + 1. Generating a batch of sequences with specified context and query lengths. + 2. Computing a ground-truth attention output using torch.sdpa on + contiguous Q, K, and V tensors. + 3. Simulating vLLM's paged KV cache: It takes the context portion of the + K/V tensors and manually places them into a paged buffer according to + the test's (randomly generated) block table. + 4. Running each vLLM attention backend with the new queries and the + simulated paged KV cache. + 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + """ + batch_spec = BATCH_SPECS[batch_spec_name] + vllm_config = create_vllm_config(model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=2048) + device = torch.device("cuda:0") + + kv_cache_spec = create_standard_kv_cache_spec(vllm_config) + + # 1. Setup + batch_size = batch_spec.batch_size + seq_lens = batch_spec.seq_lens + query_lens = batch_spec.query_lens + num_q_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + block_size = vllm_config.cache_config.block_size + kv_lora_rank = 512 + qk_rope_head_dim = 64 + qk_nope_head_dim = 128 + v_head_dim = 128 + total_head_size = kv_lora_rank + qk_rope_head_dim + assert kv_lora_rank + qk_rope_head_dim == head_size, \ + f"MLA dimensions don't match: {total_head_size} != {head_size}" + scale = 1.0 / (total_head_size**0.5) + + # 2. 
Generate data and compute SDPA reference output for MLA + all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] + all_sdpa_outputs = [] + kv_c_contexts, k_pe_contexts = [], [] + + # Create shared MLA weight matrices for consistency across all sequences + W_UK = torch.randn(kv_lora_rank, + num_q_heads, + qk_nope_head_dim, + dtype=dtype, + device=device) + W_UV = torch.randn(kv_lora_rank, + num_q_heads, + v_head_dim, + dtype=dtype, + device=device) + kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) + + for i in range(batch_size): + s_len = seq_lens[i] + q_len = query_lens[i] + context_len = s_len - q_len + + # Generate MLA tensors + # Q has both nope and rope components: + # [q_len, num_heads, qk_nope_head_dim + qk_rope_head_dim] + q_c = torch.randn(q_len, + num_q_heads, + qk_nope_head_dim + qk_rope_head_dim, + dtype=dtype, + device=device) + + # KV_C (latent K/V): [s_len, kv_lora_rank] + kv_c_full = torch.randn(s_len, + kv_lora_rank, + dtype=dtype, + device=device) + + # K_PE (rope component): [s_len, 1, qk_rope_head_dim] + k_pe_full = torch.randn(s_len, + 1, + qk_rope_head_dim, + dtype=dtype, + device=device) + + # Determine if this is decode (single token) + # or prefill (multiple tokens) + is_decode = q_len == 1 + + # Split q into nope and rope components + q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) + + if is_decode: + # Decode path: MQA-style attention in latent space + # Transform q_nope to latent space: q_nope @ W_UK + # q_nope: [1, num_heads, qk_nope_head_dim] + # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] + ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, + W_UK) # [1, num_heads, kv_lora_rank] + + # Build MQA attention inputs + # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] + q_mqa = torch.cat([ql_nope, q_pe], dim=-1) + # K: [s_len, kv_lora_rank + qk_rope_head_dim] + # (broadcasted to all heads) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) + # V: [s_len, kv_lora_rank] (broadcasted to all heads) + v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze( + 0) # [1, num_heads, kv_lora_rank] + + # Project back to output space: sdpa_out @ W_UV + sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + else: + # Prefill path: MHA-style attention with full sequence + # Apply kv_b_proj to the full kv_c tensor + kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, + kv_b_proj_weight) + k_nope_full, v_full = kv_nope_full.split( + [qk_nope_head_dim, v_head_dim], dim=-1) + + # Build attention inputs for full sequence + q_mha = torch.cat([q_nope, q_pe], + dim=-1) # [q_len, num_heads, total_dim] + k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) + k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) + + # Create custom attention mask: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their pos + attn_mask = torch.ones(q_len, + s_len, + dtype=torch.bool, + device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, 
device=device)) + attn_mask[:, context_len:] = causal_mask + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + + # Single attention call with custom mask + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + attn_mask=attn_mask, + scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + + all_sdpa_outputs.append(sdpa_out_i) + + # Inputs for vLLM MLA backends are just the new tokens + all_q_vllm.append(q_c) + all_kv_c_vllm.append(kv_c_full[context_len:]) # New kv_c tokens + all_k_pe_vllm.append(k_pe_full[context_len:]) # New k_pe tokens + + # Contextual K/V data used to populate the paged cache (MLA format) + kv_c_contexts.append(kv_c_full[:context_len]) + k_pe_contexts.append(k_pe_full[:context_len]) + + # Concatenate all sequences (no reordering needed) + query_vllm = torch.cat(all_q_vllm, dim=0) + kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) + k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) + sdpa_output = torch.cat(all_sdpa_outputs, dim=0) + + # Create mock kv_b_proj using the same weights as reference implementation + from vllm.model_executor.layers.linear import ColumnParallelLinear + mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank, + output_size=num_q_heads * + (qk_nope_head_dim + v_head_dim), + bias=False).to(device=device, + dtype=dtype) + + # Set the mock weights to match our reference implementation + # Reshape W_UK and W_UV to match the expected kv_b_proj format + # [kv_lora_rank, num_heads, qk_nope_head_dim + v_head_dim] + kv_b_proj_weight = kv_b_proj_weight.view( + kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim)) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + + # Create metadata using original batch spec + common_attn_metadata = create_common_attn_metadata( + batch_spec, vllm_config.cache_config.block_size, device) + + # 3. Simulate Paged KV Cache and a realistic slot_mapping + kv_cache = create_and_prepopulate_kv_cache( + kv_c_contexts=kv_c_contexts, + k_pe_contexts=k_pe_contexts, + block_size=block_size, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + device=device, + num_blocks=vllm_config.cache_config.num_gpu_blocks, + common_attn_metadata=common_attn_metadata, + randomize_blocks=True) + + # 4. 
Run vLLM backends and compare + for backend_name in BACKENDS_TO_TEST: + backend_output = run_attention_backend( + backend_name, kv_cache_spec, ["placeholder"], vllm_config, device, + common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache, + kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim, + mock_kv_b_proj) + + # Check shape and dtype consistency + assert backend_output.shape == sdpa_output.shape, ( + f"[{backend_name}] shape {backend_output.shape} != " + f"SDPA shape {sdpa_output.shape}") + assert backend_output.dtype == sdpa_output.dtype, ( + f"[{backend_name}] dtype {backend_output.dtype} != " + f"SDPA dtype {sdpa_output.dtype}") + + assert torch.isfinite(backend_output).all(), ( + f"[{backend_name}] produced non-finite values") + + # Check numerical similarity + rtol = 1e-2 + atol = 5e-1 + + max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() + max_rel_diff = torch.max( + torch.abs(backend_output - sdpa_output) / + torch.abs(sdpa_output)).item() + all_close = torch.allclose(backend_output, + sdpa_output, + rtol=rtol, + atol=atol) + + assert all_close, ( + f"[{backend_name}] output differs from SDPA baseline. " + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e547e71e0c..6a08cdc56f 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -135,6 +135,12 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", _Backend.XFORMERS_VLLM_V1: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", + _Backend.CUTLASS_MLA: + "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", + _Backend.FLASHMLA_VLLM_V1: + "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", + _Backend.TRITON_MLA_VLLM_V1: + "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } if backend_name not in backend_map: @@ -167,9 +173,11 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", tensor_parallel_size: int = 1, max_model_len: int = 1024, dtype: Union[ModelDType, torch.dtype] = "auto", + num_gpu_blocks: int = 1000, block_size: int = 16, max_num_seqs: int = 256, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, add_mock_model_methods: bool = True) -> VllmConfig: """Create a VllmConfig for testing with reasonable defaults.""" @@ -189,7 +197,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", ) # Set cache blocks for testing # (these may be set during initialization normally) - cache_config.num_gpu_blocks = 1000 + cache_config.num_gpu_blocks = num_gpu_blocks cache_config.num_cpu_blocks = 0 parallel_config = ParallelConfig( @@ -198,6 +206,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", scheduler_config = SchedulerConfig( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, ) device_config = DeviceConfig() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 646e4fec83..03028ebfe7 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -24,7 +24,7 @@ Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. +* Use a single latent vector to represent the per-token entry of the KV cache. * For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. @@ -82,7 +82,7 @@ spda_o = scaled_dot_product_attention( torch.cat([q_nope, q_pe], dim=-1), torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), v -) +) return spda_o @ W_O NOTE: in the actual code, @@ -120,20 +120,20 @@ return o.view(-1, N * V) @ self.num_heads @ W_O ## Chunked Prefill -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to +For chunked prefill we want to use the compute friendly algorithm. We are +assuming sufficiently large Sq / Skv ratio, in the future may want to switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small. However, the compute-friendly approach can potentially run out of memory if Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a +To mitigate this, we chunk the computation of attention with respect to the +current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a fixed workspace size. The chunked prefill approach is as follows: -MCC Max chunk of context to process per iter, computed dynamically, +MCC Max chunk of context to process per iter, computed dynamically, used to bound the memory usage q_c = h_t @ W_DQ @@ -155,7 +155,7 @@ curr_o, curr_lse = scaled_dot_product_attention( new_v, casual=True, return_softmax_lse=True -) +) // Compute attention with the already existing context for chunk_idx in range(cdiv(C, MCC)): From c86af22f31838ee654c856279ac5110ae3fdb2cc Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:04:21 -0700 Subject: [PATCH 448/932] [Fix] remove is_marlin param in benchmark_moe (#23286) From 4b795020eda910ecf16c289a23c4a6c119a4b43b Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:46:06 -0700 Subject: [PATCH 449/932] [EP] Add logging for experts map (#22685) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Simon Mo --- vllm/model_executor/layers/fused_moe/layer.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aa8ceda1bb..b16c21b701 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -695,6 +695,26 @@ def determine_expert_map( return (local_num_experts, expert_map) +def get_compressed_expert_map(expert_map: torch.Tensor) -> str: + """ + Compresses the expert map by removing any -1 entries. + + Args: + expert_map (torch.Tensor): A tensor of shape (global_num_experts,) + mapping from global to local index. Contains -1 for experts not + assigned to the current rank. + + Returns: + str: A string mapping from local to global index. + Using str to support hashing for logging once only. 
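+
+    Example (illustrative):
+        For expert_map = tensor([-1, 0, -1, 1]), experts 1 and 3 are local to
+        this rank, so the returned string is "0->1, 1->3".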
+ """ + global_indices = torch.where(expert_map != -1)[0] + local_indices = expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices)) + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -795,6 +815,12 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", self.ep_rank, self.ep_size, self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self.expert_map)) else: self.local_num_experts, self.expert_map = (self.global_num_experts, None) From f5aa307d7795b8400d3719087c502c2a227030c7 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 20:14:59 -0400 Subject: [PATCH 450/932] Remove duplicate entry in vllm.attention.__all__ (#23296) Signed-off-by: Russell Bryant --- vllm/attention/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 344040586a..dcb2aa68fb 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -14,7 +14,6 @@ __all__ = [ "AttentionMetadata", "AttentionType", "AttentionMetadataBuilder", - "Attention", "AttentionState", "get_attn_backend", ] From bbea1cefdd1a29b53355b1655f5d2ae343921f85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 20:18:12 -0400 Subject: [PATCH 451/932] [CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcf7875894..f3afc015f6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -888,12 +888,6 @@ class EngineArgs: parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') - parser.add_argument('--enable-prompt-adapter', - action='store_true', - deprecated=True, - help='[DEPRECATED] Prompt adapter has been ' - 'removed. 
Setting this flag to True or False' - ' has no effect on vLLM behavior.') return parser From b029de9902aa3ac58806c8c17776c7074175b6db Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 18:25:56 -0700 Subject: [PATCH 452/932] [Optimization] Make new_block_ids None if empty (#23262) Signed-off-by: Woosuk Kwon --- vllm/v1/core/kv_cache_manager.py | 30 ++++++++++++++++++++++++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 24 ++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 14 +++++++++----- vllm/v1/worker/tpu_model_runner.py | 14 +++++++++----- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bfaa7ab08f..fd0bdb2c80 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional +from typing import Literal, Optional, overload from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger @@ -37,7 +37,24 @@ class KVCacheBlocks: tuple(blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks))) - def get_block_ids(self) -> tuple[list[int], ...]: + @overload + def get_block_ids( + self, + allow_none: Literal[False] = False, + ) -> tuple[list[int], ...]: + ... + + @overload + def get_block_ids( + self, + allow_none: Literal[True] = True, + ) -> Optional[tuple[list[int], ...]]: + ... + + def get_block_ids( + self, + allow_none: bool = False, + ): """ Converts the KVCacheBlocks instance to block_ids. @@ -46,6 +63,8 @@ class KVCacheBlocks: * the outer tuple corresponds to KV cache groups * each inner list contains the block_ids of the blocks in that group """ + if allow_none and all(len(group) == 0 for group in self.blocks): + return None return tuple([blk.block_id for blk in group] for group in self.blocks) def get_unhashed_block_ids(self) -> list[int]: @@ -348,10 +367,13 @@ class KVCacheManager: """ return self.block_pool.take_events() + def get_blocks(self, request_id: str) -> KVCacheBlocks: + """Get the blocks of a request.""" + return KVCacheBlocks(self.coordinator.get_blocks(request_id)) + def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" - return KVCacheBlocks( - self.coordinator.get_blocks(request_id)).get_block_ids() + return self.get_blocks(request_id).get_block_ids() def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index fac07f9719..9ba7ec9d96 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -91,7 +91,7 @@ class CachedRequestData: # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. 
new_token_ids: list[list[int]] - new_block_ids: list[tuple[list[int], ...]] + new_block_ids: list[Optional[tuple[list[int], ...]]] num_computed_tokens: list[int] @property diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4b167da5c8..0b528587b9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) -from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) @@ -185,7 +185,7 @@ class Scheduler(SchedulerInterface): # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {} + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -288,8 +288,7 @@ class Scheduler(SchedulerInterface): # Therefore, we might introduce some additional # cycle to fill in the bitmask, which could be a big no-op. structured_output_request_ids[request.request_id] = req_index - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -496,8 +495,8 @@ class Scheduler(SchedulerInterface): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + req_to_new_blocks[request.request_id] = ( + self.kv_cache_manager.get_blocks(request.request_id)) num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING @@ -546,8 +545,8 @@ class Scheduler(SchedulerInterface): ) # Construct the scheduler output. new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) for req in scheduled_new_reqs ] cached_reqs_data = self._make_cached_request_data( @@ -555,7 +554,7 @@ class Scheduler(SchedulerInterface): scheduled_resumed_reqs, num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids, + req_to_new_blocks, ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -628,11 +627,11 @@ class Scheduler(SchedulerInterface): resumed_reqs: list[Request], num_scheduled_tokens: dict[str, int], spec_decode_tokens: dict[str, list[int]], - req_to_new_block_ids: dict[str, tuple[list[int], ...]], + req_to_new_blocks: dict[str, KVCacheBlocks], ) -> CachedRequestData: req_ids: list[str] = [] new_token_ids: list[list[int]] = [] - new_block_ids: list[tuple[list[int], ...]] = [] + new_block_ids: list[Optional[tuple[list[int], ...]]] = [] num_computed_tokens: list[int] = [] use_connector = self.connector is not None @@ -655,7 +654,8 @@ class Scheduler(SchedulerInterface): # out of bounds errors. TODO: Remove this once the KVConnector # is updated to handle token IDs properly. 
new_token_ids.append([]) - new_block_ids.append(req_to_new_block_ids[req_id]) + new_block_ids.append( + req_to_new_blocks[req_id].get_block_ids(allow_none=True)) num_computed_tokens.append(req.num_computed_tokens) # Because resumed_reqs is usually empty, it is more efficient to do # in-place appending so that we don't need to allocate a new list. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33747d6917..cc86f98264 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -574,11 +574,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the block IDs. if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -594,7 +596,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9196c62377..0f569500cd 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -418,11 +418,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the cached states. req_state.num_computed_tokens = num_computed_tokens if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -438,7 +440,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. 
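
The change above relies on typing.overload with Literal so that the default
call to get_block_ids() keeps its non-optional return type while callers that
pass allow_none=True get an Optional one. A minimal standalone sketch of that
typing pattern (names here are illustrative, not vLLM APIs):

    from typing import Literal, Optional, overload

    @overload
    def get_ids(allow_none: Literal[False] = False) -> tuple[list[int], ...]: ...

    @overload
    def get_ids(allow_none: Literal[True]) -> Optional[tuple[list[int], ...]]: ...

    def get_ids(allow_none: bool = False):
        # Pretend these are the per-group block ids; both groups are empty here.
        blocks: tuple[list[int], ...] = ([], [])
        if allow_none and all(len(group) == 0 for group in blocks):
            return None
        return blocks

    assert get_ids() == ([], [])             # default call: always a tuple
    assert get_ids(allow_none=True) is None  # empty groups collapse to None

This mirrors how the scheduler passes None for requests whose block lists did
not grow, letting the GPU/TPU model runners skip the block-table append work
entirely.
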
From 7be5d113d8784536b79f27f24cfa91958dc291b0 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 21 Aug 2025 09:34:24 +0800 Subject: [PATCH 453/932] [CPU] Refactor CPU W8A8 scaled_mm (#23071) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 7 +- cmake/cpu_extension.cmake | 59 +- csrc/cpu/cpu_types_x86.hpp | 8 +- csrc/cpu/dnnl_helper.cpp | 346 +++++++ csrc/cpu/dnnl_helper.h | 169 ++++ csrc/cpu/dnnl_helper.hpp | 206 ---- csrc/cpu/dnnl_kernels.cpp | 494 +++++++++ csrc/cpu/quant.cpp | 951 ------------------ csrc/cpu/torch_bindings.cpp | 92 +- tests/kernels/test_onednn.py | 144 +++ vllm/_custom_ops.py | 83 ++ vllm/model_executor/layers/fused_moe/layer.py | 11 +- vllm/model_executor/layers/linear.py | 8 +- .../kernels/scaled_mm/__init__.py | 4 +- .../quantization/kernels/scaled_mm/cpu.py | 206 ++++ .../quantization/kernels/scaled_mm/cutlass.py | 4 +- vllm/model_executor/layers/utils.py | 6 + 17 files changed, 1525 insertions(+), 1273 deletions(-) create mode 100644 csrc/cpu/dnnl_helper.cpp create mode 100644 csrc/cpu/dnnl_helper.h delete mode 100644 csrc/cpu/dnnl_helper.hpp create mode 100644 csrc/cpu/dnnl_kernels.cpp delete mode 100644 csrc/cpu/quant.cpp create mode 100644 tests/kernels/test_onednn.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 57a7bc4e5f..9dec9f8e9e 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -46,6 +46,11 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -v -s tests/kernels/test_onednn.py" + # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e @@ -99,4 +104,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index e0da46e2ac..cc38cd41a5 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -182,17 +182,17 @@ endif() # # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) # Flag to enable ACL kernels for AARCH64 platforms -if ( VLLM_BUILD_ACL STREQUAL "ON") +if (VLLM_BUILD_ACL STREQUAL "ON") set(USE_ACL ON) else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.8.1 + GIT_TAG v3.9 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -204,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) endif() set(ONEDNN_AARCH64_USE_ACL "ON") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") - endif() + endif() set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_BUILD_DOC "OFF") @@ -217,38 +217,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) -elseif(POWER10_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.7.2 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE + add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") + target_include_directories( + dnnl_ext + PUBLIC ${oneDNN_SOURCE_DIR}/include + PUBLIC ${oneDNN_BINARY_DIR}/include + PRIVATE ${oneDNN_SOURCE_DIR}/src ) - - set(ONEDNN_LIBRARY_TYPE "STATIC") - set(ONEDNN_BUILD_DOC "OFF") - set(ONEDNN_BUILD_EXAMPLES "OFF") - set(ONEDNN_BUILD_TESTS "OFF") - set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") - set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - - set(DNNL_CPU_RUNTIME "OMP") - - FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) + target_link_libraries(dnnl_ext dnnl) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + list(APPEND LIBS dnnl_ext) + set(USE_ONEDNN ON) +else() + set(USE_ONEDNN OFF) endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -275,7 +260,6 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) @@ -289,14 +273,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ${VLLM_EXT_SRC}) add_compile_definitions(-DCPU_CAPABILITY_AVX512) endif() -elseif(POWER10_FOUND) - set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" - ${VLLM_EXT_SRC}) endif() -if (ASIMD_FOUND) + +if(USE_ONEDNN) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" + "csrc/cpu/dnnl_kernels.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 3952c43cbc..982f7c07a1 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const FP32Vec16&); - void save(void* ptr) const { 
*reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 1)) {} void save(void* ptr) const { - *reinterpret_cast<__m256i*>(ptr) = reg_low; - *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + _mm256_storeu_si256((__m256i*)ptr, reg_low); + _mm256_storeu_si256((__m256i*)ptr + 1, reg_high); } }; #endif diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp new file mode 100644 index 0000000000..f3f00edb36 --- /dev/null +++ b/csrc/cpu/dnnl_helper.cpp @@ -0,0 +1,346 @@ +#include +#include + +#include "common/memory_desc.hpp" +#include "common/memory.hpp" + +#include "dnnl_helper.h" + +static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; +} + +static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; +} + +void release_dnnl_matmul_handler(int64_t handler) { + DNNLMatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + delete ptr; +} + +template +class DNNLPrimitiveCache { + public: + using cache_value_t = std::pair; + using result_value_t = VT; + using container_t = std::list; + using value_iterator_t = typename container_t::iterator; + using map_t = std::unordered_map; + using creator_t = VT (*)(); + + public: + DNNLPrimitiveCache(size_t capacity) + : capacity_(capacity), + values_(), + key_to_value_(std::min(256lu, capacity)) { + assert(capacity > 0); + } + + template + result_value_t get_or_create(const KT& key, F&& creator) { + std::optional value = get_value(key); + if (value.has_value()) { + return value.value()->second; + } else { + return add_value({key, creator()})->second; + } + } + + size_t size() const { return values_.size(); } + + private: + void dump_data() { + std::stringstream ss; + ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec + << "\n"; + ss << "container: ["; + for (auto&& iter : values_) { + ss << "(" << iter.first << ", " << std::hex + << reinterpret_cast(iter.second.get()) << "), " << std::dec; + } + ss << "]\n"; + + ss << "map: ["; + for (auto&& iter : key_to_value_) { + ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex + << reinterpret_cast(iter.second->second.get()) << std::dec + << "), "; + } + ss << "]\n"; + std::printf("%s\n", ss.str().c_str()); + } + + value_iterator_t add_value(cache_value_t&& new_value) { + if (size() == capacity_) { + cache_value_t& last_item = values_.back(); + key_to_value_.erase(last_item.first); + values_.pop_back(); + } + + auto& added_value_ = values_.emplace_front(std::move(new_value)); + key_to_value_.emplace(added_value_.first, values_.begin()); + return values_.begin(); + } + + std::optional get_value(const KT& key) { + if (key_to_value_.size() > 0 && key == values_.begin()->first) { + return values_.begin(); + } + + auto value_map_iterator = key_to_value_.find(key); + if (value_map_iterator != key_to_value_.end()) { + values_.splice(values_.begin(), values_, value_map_iterator->second); + return value_map_iterator->second; + } 
else { + return {}; + } + } + + private: + const size_t capacity_; + container_t values_; + map_t key_to_value_; +}; + +DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( + const Args& args, dnnl::memory::data_type b_type) + : b_n_size_(args.b_n_size), + b_n_stride_(args.b_n_stride), + b_k_size_(args.b_k_size), + b_k_stride_(args.b_k_stride), + b_type_(b_type), + c_type_(args.c_type), + runtime_memory_ptrs_(8), + primitive_cache_size_(args.primitive_cache_size) { + assert(primitive_cache_size_ > 0); +} + +void DNNLMatMulPrimitiveHandler::prepack_weight( + void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) { + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); + dnnl::memory packed_weight(b_target_mem_desc, default_engine()); + { + dnnl::reorder(original_weight, packed_weight) + .execute(default_stream(), original_weight, packed_weight); + default_stream().wait(); + } + memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight; + b_target_mem_desc_ = b_target_mem_desc; +} + +void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr( + size_t index, dnnl_memory* memory_ptr) { + dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage(); + dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md()); + runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc}; +} + +std::pair +DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) { + return runtime_memory_ptrs_[index]; +} + +namespace std { +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^ + hash()(static_cast(val.a_qs)) ^ + hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^ + hash()(static_cast(val.c_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; +} // namespace std + +bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size && + l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp && + l.c_type == r.c_type; +} + +bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size && + l.bias_type == r.bias_type; +} + +static std::shared_ptr +get_w8a8_class_primitive_cache( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), + dnnl::memory::data_type::s8), + use_azp_(args.use_a_zero_point), + a_qs_(args.a_quantization_strategy), + b_qs_(args.b_quantization_strategy), + m_size_cache_(nullptr) { + assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL); + assert(b_qs_ != QuantizationStrategy::PER_TOKEN); + if (a_qs_ == QuantizationStrategy::PER_TOKEN) { + assert(!use_azp_); + }; + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, 
+ .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2); + a_scale_storage->set_data_handle((void*)args.a_scales_ptr); + } + if (use_azp_) { + auto&& [a_zero_point_storage, a_zero_point_mem_desc] = + get_runtime_memory_ptr(3); + a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr); + } + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, + .b_k_size = b_k_size_, + .a_qs = a_qs_, + .b_qs = b_qs_, + .use_azp = use_azp_, + .c_type = c_type_}; + m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_); + } + + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + return dnnl::matmul(desc); + }); +} + +void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get()); + if (use_azp_) { + memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get()); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), + (void*)args.b_scales_ptr); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), (void*)args.b_scales_ptr); + } + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); +} + +dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( + const 
MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md({key.a_m_size, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab); + dnnl::memory::desc b_md; + if (first_time) { + b_md = + dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + } else { + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_SRC, 0); + if (use_azp_) { + attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + if (key.use_bias) { + // For PER_TOKEN, bias will be applied in epilogue + assert(a_qs_ == QuantizationStrategy::PER_TENSOR); + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h new file mode 100644 index 0000000000..54ceefced9 --- /dev/null +++ b/csrc/cpu/dnnl_helper.h @@ -0,0 +1,169 @@ +#ifndef DNNL_HELPER_H +#define DNNL_HELPER_H + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace c10 { +struct BFloat16; +struct Half; +} // namespace c10 + +namespace dnnl { +namespace impl { +struct memory_storage_t; +struct matmul_pd_t; +struct matmul_desc_t; +} // namespace impl +} // namespace dnnl +struct dnnl_memory_desc; + +template +class DNNLPrimitiveCache; + +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} + +class DNNLMatMulPrimitiveHandler { + public: + virtual ~DNNLMatMulPrimitiveHandler() = default; + + protected: + struct Args { + dnnl_dim_t b_n_size; + dnnl_dim_t b_n_stride; + dnnl_dim_t b_k_size; + dnnl_dim_t b_k_stride; + void* b_ptr; + dnnl::memory::data_type c_type; + size_t primitive_cache_size; + }; + + protected: + DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); + + void prepack_weight(void* original_b_ptr, + dnnl::memory::desc b_target_mem_desc); + + void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); + + std::pair + get_runtime_memory_ptr(size_t index); + + protected: + const dnnl_dim_t b_n_size_; + const dnnl_dim_t b_n_stride_; + const dnnl_dim_t b_k_size_; + const dnnl_dim_t b_k_stride_; + dnnl::memory::data_type b_type_; + dnnl::memory::data_type c_type_; + std::unordered_map 
memory_cache_; + std::vector> + runtime_memory_ptrs_; + dnnl::memory::desc b_target_mem_desc_; + int64_t primitive_cache_size_; +}; + +class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL }; + + struct Args : public DNNLMatMulPrimitiveHandler::Args { + bool use_a_zero_point; + QuantizationStrategy a_quantization_strategy; + QuantizationStrategy b_quantization_strategy; + float* b_scales_ptr; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + QuantizationStrategy a_qs; + QuantizationStrategy b_qs; + bool use_azp; + dnnl::memory::data_type c_type; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const int8_t* a_ptr; + const float* a_scales_ptr; + const int32_t* a_zero_points_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + W8A8MatMulPrimitiveHandler(const Args& args); + + QuantizationStrategy get_input_scale_strategy() const { return a_qs_; } + + bool get_input_use_zero_point() const { return use_azp_; } + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + const bool use_azp_; + const QuantizationStrategy a_qs_; + const QuantizationStrategy b_qs_; + std::shared_ptr m_size_cache_; +}; + +#endif diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp deleted file mode 100644 index 1cb8dc5b25..0000000000 --- a/csrc/cpu/dnnl_helper.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef DNNL_HELPER_HPP -#define DNNL_HELPER_HPP - -#include -#include - -#include "oneapi/dnnl/dnnl.hpp" - -namespace { -template -struct DNNLType { - static constexpr dnnl::memory::data_type type = - dnnl::memory::data_type::undef; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; -}; - -template -constexpr inline dnnl::memory::data_type get_dnnl_type() { - return DNNLType>::type; -} -}; // namespace - -template -class DNNLPrimitiveHelper { - public: - // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) - // A: [M, K], row-major - // B: [K, N], column-major - // C: [M, N], row-major - // bias: [N], row-major, optional - // a_scales: [MS] - // b_scales: [NS] - // Note: Due to the limitation of oneDNN - // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is - // not supported. 
- - template - static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, - const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, - dnnl_dim_t K, const float* a_scales, - const float* b_scales, dnnl_dim_t MS, - dnnl_dim_t NS) { - auto&& OutputType = get_dnnl_type(); - auto&& BiasType = get_dnnl_type(); - - dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); - dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); - dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); - - dnnl::primitive_attr attr; - if constexpr (!InputNoScale) { - if (MS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_SRC, 0); - } else { - // per-token - TORCH_CHECK(false, "per-token quantization is unsupported."); - } - } - - if (NS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); - } else { - // per-channel - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); - } - - dnnl::matmul::primitive_desc matmul_pd; -// Create memory descriptors with format_tag::any for the primitive. This -// enables the matmul primitive to choose memory layouts for an -// optimized primitive implementation, and these layouts may differ from the -// ones provided by the user. -#ifdef __aarch64__ - auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, - dnnl::memory::format_tag::any); - auto mat_weights_md = dnnl::memory::desc( - {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); - auto mat_dst_md = - dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, - mat_weights_md, bias_md, - mat_dst_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc( - default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); - } -#else - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - bias_md, c_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - c_md, attr); - } -#endif - dnnl::matmul matmul(matmul_pd); - - auto& engine = default_engine(); - - dnnl::memory a_m(a_md, engine, (void*)a); - dnnl::memory b_m(b_md, engine, (void*)b); - dnnl::memory c_m(c_md, engine, (void*)c); - dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)a_scales); - dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)b_scales); - - auto& stream = default_stream(); - - auto mat_src_mem = a_m; - auto mat_weights_mem = b_m; - auto mat_dst_mem = c_m; -#ifdef __aarch64__ - if (matmul_pd.weights_desc() != b_m.get_desc()) { - mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); - dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); - } -#endif - if constexpr (InputNoScale) { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } else { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory 
bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } - stream.wait(); - } - - private: - static dnnl::engine& default_engine() { - static dnnl::engine engine(dnnl::engine::kind::cpu, 0); - return engine; - } - - static dnnl::stream& default_stream() { - static dnnl::stream stream(default_engine()); - return stream; - } -}; -#endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp new file mode 100644 index 0000000000..acc3b9ecde --- /dev/null +++ b/csrc/cpu/dnnl_kernels.cpp @@ -0,0 +1,494 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.h" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; +#endif + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const 
scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = azp_val; + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const int32_t* azp, + const float* azp_adj, const scalar_t* bias, + const int64_t num_tokens, + const int64_t hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + const int64_t thread_num = omp_get_max_threads(); + if (num_tokens > thread_num) { +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + const float* 
input_ptr = input + i * hidden_size; + scalar_t* output_ptr = output + i * hidden_size; + int64_t j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; ++j) { + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j); + } + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j, hidden_size - j); + } + } else { + const int64_t vec_iteration = + (hidden_size + vec_elem_num - 1) / vec_elem_num; + const int64_t vec_iteration_per_thread = + (vec_iteration + thread_num - 1) / thread_num; + const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num; +#pragma omp parallel for schedule(static, 1) + for (int64_t i = 0; i < thread_num; ++i) { + const int64_t start = elem_num_per_thread * i; + const int64_t end = std::min(hidden_size, elem_num_per_thread + start); + for (int64_t j = 0; j < num_tokens; ++j) { + cvt_vec_t token_scale_vec(a_scale[j]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[j] * static_cast(azp[j]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + int64_t k = start; + const float* input_ptr = input + j * hidden_size; + scalar_t* output_ptr = output + j * hidden_size; + for (; k < end - vec_elem_num; k += vec_elem_num) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k); + } + if (k < end) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k, end - k); + } + } + } + } +} +} // namespace + +int64_t create_onednn_scaled_mm_handler( + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& b_scales, // [1] or [OC] + at::ScalarType output_type, bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(b_scales.is_contiguous()); + + W8A8MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + if (b_scales.numel() == 1) { + 
args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + } else { + TORCH_CHECK_EQ(b_scales.numel(), b.size(1)); + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL; + } + args.b_scales_ptr = b_scales.data_ptr(); + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + if (dynamic_act_quant) { + // dynamic per-token, bias, A scales and A zps will be applied in outside. + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN; + args.use_a_zero_point = false; + } else { + // static per-tensor + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + args.use_a_zero_point = use_azp; + } + + VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler", + [&] { + if (dynamic_act_quant) { + args.c_type = get_dnnl_type(); + } else { + args.c_type = get_dnnl_type(); + } + }); + + return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args)); +} + +void onednn_scaled_mm( + torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& a_scales, // [M] or [1] + const std::optional& azp, // [M] or [1] + const std::optional& azp_adj, // [M] or [1] + const std::optional& bias, // [N] + int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_scaled_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.is_contiguous()); + TORCH_CHECK(c.is_contiguous()); + W8A8MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + const int32_t* azp_ptr = nullptr; + if (azp.has_value()) { + azp_ptr = azp->data_ptr(); + } + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + TORCH_CHECK_EQ(a_scales.numel(), 1); + } + + W8A8MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_ptr = a.data_ptr(); + exec_args.a_m_size = a.size(0); + exec_args.bias_ptr = nullptr; + exec_args.use_bias = false; + exec_args.a_scales_ptr = nullptr; + exec_args.a_zero_points_ptr = nullptr; + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] { + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + if (bias.has_value()) { + exec_args.bias_ptr = bias->data_ptr(); + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = true; + } + exec_args.a_scales_ptr = a_scales.data_ptr(); + exec_args.a_zero_points_ptr = azp_ptr; + exec_args.c_ptr = c.data_ptr(); + ptr->execute(exec_args); + } else if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) { + torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + exec_args.c_ptr = tmp_fp32_out.data_ptr(); + ptr->execute(exec_args); + if (bias.has_value()) { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + (scalar_t*)nullptr, c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + 
a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr, + c.size(0), c.size(1)); + } + } + } else { + TORCH_CHECK(false, "invalid act quant type."); + } + }); +} + +// static-per-tensor quantization. +void static_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + const torch::Tensor& scale, std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int64_t stride = input.stride(0); + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + static_scaled_int8_quant_impl(input.data_ptr(), + out.data_ptr(), + scale.data_ptr(), nullptr, + num_tokens, stride, hidden_size); + } + }); +} + +// dynamic-per-token quantization. +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + torch::Tensor& scale, // [batch, 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + const int64_t stride = input.stride(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, stride, + hidden_size); + } + }); +} diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp deleted file mode 100644 index 6e120b8d20..0000000000 --- a/csrc/cpu/quant.cpp +++ /dev/null @@ -1,951 +0,0 @@ -#include "cpu_types.hpp" -#include "dnnl_helper.hpp" - -namespace { -template -struct KernelVecType { - using load_vec_type = void; - using azp_adj_load_vec_type = void; - using cvt_vec_type = void; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::FP32Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) -template <> -struct KernelVecType { - using load_vec_type = vec_op::BF16Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; -#endif - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power architecture-specific vector type - using load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures - using load_vec_type = vec_op::FP16Vec16; -#endif - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if defined(__AVX512F__) || defined(__aarch64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = 
typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size 
+ j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t 
b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#elif defined(__powerpc64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = 
max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void 
dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#else -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " - "support.") -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "dynamic_scaled_int8_quant_impl requires " - "AVX512/powerpc64/AArch64 support.") -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_with_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, - "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} -#endif -} // namespace - -void 
int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Ideally we want to fuse the GEMM and the scale procedure with oneDNN - // JIT, the intermediate data is cached in registers or L1. But for now - // the oneDNN GEMM code generation only supports two quantization - // patterns: per-tensor or per-output-channel of weight. - // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * - // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN - // GEMM, then the per-token scale (and bias) is applied with the epilogue - // C=s_a * C_inter + bias. 
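The deleted comment above spells out why per-token activation scales cannot be fused into the oneDNN GEMM: the GEMM applies only the weight scale, and the per-token scale (plus bias) is applied in a separate epilogue. A minimal PyTorch reference of that two-step decomposition, purely illustrative (fp32 math, no vectorization; the function name is invented for this sketch):

```python
from typing import Optional
import torch

def ref_per_token_scaled_mm(a_q: torch.Tensor,   # [M, K] int8 activations
                            b_q: torch.Tensor,   # [K, N] int8 weights
                            s_a: torch.Tensor,   # [M, 1] per-token activation scales
                            s_b: torch.Tensor,   # [1, N] or scalar weight scales
                            bias: Optional[torch.Tensor]) -> torch.Tensor:
    # Step 1: what the oneDNN GEMM produces, with only the weight scale fused.
    c_inter = (a_q.float() @ b_q.float()) * s_b      # C_inter = s_b * (A @ B)
    # Step 2: the epilogue applies the per-token activation scale and bias.
    c = s_a * c_inter                                # C = s_a * C_inter + bias
    if bias is not None:
        c = c + bias.float()
    return c
```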
- torch::Tensor tmp_fp32_out = - torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - bias->data_ptr(), a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } else { - // Compute C=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - nullptr, a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - } - }); -} - -void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] - const std::optional& azp, // [1] or [M] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_azp only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); - } - if (azp) { - TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); - } - TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); - - // azp & bias types - TORCH_CHECK(azp_adj.dtype() == torch::kInt32); - TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); - TORCH_CHECK(!bias || bias->dtype() == c.dtype(), - "currently bias dtype must match output dtype ", c.dtype()); - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias 
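The formula in the comment above (C = s_a * C_inter - s_a * s_b * azp * azp_adj + bias) follows from A ≈ s_a * (A_q - azp): the zero-point term reduces to a per-output-channel column sum of the int8 weight, the same quantity the test file and the new CPU kernel build as `azp_adj`. A rough reference under those assumptions (per-tensor vs. per-channel weight scales collapsed into broadcasting; not the vectorized epilogue):

```python
from typing import Optional
import torch

def ref_azp_epilogue(c_inter: torch.Tensor,  # [M, N] = s_b * (A_q @ B_q)
                     s_a: torch.Tensor,      # [M, 1] per-token activation scales
                     s_b: torch.Tensor,      # [1, N] or scalar weight scales
                     azp: torch.Tensor,      # [M, 1] int32 activation zero points
                     b_q: torch.Tensor,      # [K, N] int8 weights
                     bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # azp_adj is the per-output-channel column sum of the quantized weight,
    # so s_a * s_b * azp * azp_adj removes the zero-point contribution.
    azp_adj = b_q.float().sum(dim=0, keepdim=True)   # [1, N]
    out = s_a * c_inter - s_a * s_b * azp.float() * azp_adj
    if bias is not None:
        out = out + bias.float()
    return out
```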
- if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } - } else { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C_inter=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), bias->data_ptr(), - a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), - b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); - } else { - // Compute C_inter=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - - // Compute C=C_inter - s_a * s_b * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } else { - // Per-Tensor - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } - } - }); -} - -// static-per-tensor quantization. -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value() || azp->numel() == 1); - - const int hidden_size = input.size(-1); - const int num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -// dynamic-per-token quantization. 
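For the dynamic per-token path referenced by the comment above, the kernels derive a symmetric scale as absmax / 127, or an asymmetric scale/zero-point pair as (max - min) / 255 and round(-128 - min / scale). A simplified PyTorch reference of that logic (rounding and saturation details of the vectorized kernels may differ):

```python
import torch

def ref_dynamic_int8_quant(x: torch.Tensor, symmetric: bool = True):
    x = x.float()
    if symmetric:
        # Per-token symmetric: scale = absmax / 127, no zero point.
        scale = x.abs().amax(dim=-1, keepdim=True) / 127.0
        q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
        return q, scale, None
    # Per-token asymmetric: map [min, max] onto [-128, 127].
    x_min = x.amin(dim=-1, keepdim=True)
    x_max = x.amax(dim=-1, keepdim=True)
    scale = (x_max - x_min) / 255.0
    azp = torch.round(-128.0 - x_min / scale).to(torch.int32)
    q = torch.clamp(torch.round(x / scale + azp), -128, 127).to(torch.int8)
    return q, scale, azp
```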
-void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_ppc64le only supports INT8 inputs."); - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - // We dont need this - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - }); -} - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b20a054648..c9f426bdf6 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -6,25 +6,20 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); -void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); +void release_dnnl_matmul_handler(int64_t handler); -void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& 
azp_adj, - const std::optional& azp, - const std::optional& bias); +int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b, + const torch::Tensor& b_scales, + at::ScalarType output_type, + bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size); -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); -#endif +void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& a_scales, + const std::optional& azp, + const std::optional& azp_adj, + const std::optional& bias, + int64_t handler); void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, @@ -151,8 +146,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // Helper function to release oneDNN handlers + ops.def("release_dnnl_matmul_handler(int handler) -> ()", + &release_dnnl_matmul_handler); + + // Create oneDNN W8A8 handler + ops.def( + "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " + "output_type, bool dynamic_act_quant, bool use_azp, int " + "primitive_cache_size) -> int", + &create_onednn_scaled_mm_handler); + + // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization + ops.def( + "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " + "Tensor? azp_adj, Tensor? bias, int handler) -> ()"); + ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); // Compute int8 quantized tensor for given scaling factor. ops.def( @@ -168,50 +180,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); -#elif defined(__powerpc64__) - // Compute int8 quantized tensor for given scaling factor. - ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); - ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); - - // Compute int8 quantized tensor and scaling factor - ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); - ops.impl("dynamic_scaled_int8_quant", torch::kCPU, - &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? 
bias) -> ()"); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()"); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif // SHM CCL diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py new file mode 100644 index 0000000000..17692384ac --- /dev/null +++ b/tests/kernels/test_onednn.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Integration tests for FlexAttention backend vs default backend""" + +from typing import Optional + +import pytest +import torch + +from tests.kernels.utils import to_int8 +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +NK_FACTORS = [ + (256, 128), + (4096, 4096), + (16384, 4096), + (1023, 491), + (1001, 15), +] +M_FACTORS = [ + (16, 1, 32, 128, 64), + (1, 17, 1, 31, 17), +] +CACHE_SIZES = [2] +DTYPE = [torch.bfloat16] + + +def rand_int8(shape: tuple, device: str = "cpu"): + return to_int8(torch.rand(shape, device=device) * 255 - 128) + + +def ref_int8_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + azp: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + output_type: torch.dtype, +): + if azp is not None: + a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32) + output = torch.mm((scale_a * a.to(dtype=torch.float32)), + (scale_b * b.to(dtype=torch.float32))) + if bias is not None: + output += bias.float() + + return output.to(dtype=output_type) + + +def onednn_int8_gemm_test_helper(primitive_cache_size: int, + m: int, + n: int, + k: int, + per_tensor_a_quant: bool, + per_tensor_b_quant: bool, + use_azp: bool, + use_bias: bool, + out_dtype: torch.dtype = torch.bfloat16, + device: str = "cpu"): + # Test for a oneDNN kernel with per-tensor / per-token activation + # quantization and per-tensor / per-output channel weight quantization. 
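As context for the helper above, a condensed sketch of the call pattern it exercises: the handler is created once from the column-major int8 weight and then reused for each activation batch. Shapes, values, and the local `to_int8_ref` helper are illustrative only, and the ops require a CPU build with the oneDNN kernels registered.

```python
import torch
from vllm import _custom_ops as ops

def to_int8_ref(t: torch.Tensor) -> torch.Tensor:
    # Saturating round-to-int8, used only for this illustration.
    return t.round().clamp(-128, 127).to(torch.int8)

m, n, k = 16, 256, 128
# Column-major [K, N] int8 weight, as required by the handler.
b_q = to_int8_ref(torch.randn((n, k)) * 5).t()
scale_b = torch.rand((1, 1), dtype=torch.float32)  # per-tensor weight scale

# The handler prepacks the weight once and can be reused across M sizes.
handler = ops.create_onednn_scaled_mm(
    b_q,
    scale_b,
    torch.bfloat16,
    True,   # dynamic (per-token) activation quantization
    False,  # no activation zero point
    2,      # primitive_cache_size
)

a_q = to_int8_ref(torch.randn((m, k)) * 5)
scale_a = torch.rand((m, 1), dtype=torch.float32)
out = torch.zeros((m, n), dtype=torch.bfloat16)
ops.onednn_scaled_mm(handler, a_q, out, scale_a, None, None, None)
```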
+ a = to_int8(torch.randn((m, k), device=device) * 5) + b = to_int8(torch.randn((n, k), device=device).t() * 5) + + a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1) + b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n) + + scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) + scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + + if use_azp: + azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5 + azp = (azp / scale_a).round().to(dtype=torch.int32) + azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32) + else: + azp = None + azp_adj = None + + if use_bias: + bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + else: + bias = None + + handler = ops.create_onednn_scaled_mm( + b, + scale_b, + out_dtype, + not per_tensor_a_quant, + use_azp, + primitive_cache_size, + ) + + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + if use_bias: + # To test runtime bias setting + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None, + out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +@pytest.mark.parametrize("n,k", NK_FACTORS) +@pytest.mark.parametrize("m_list", M_FACTORS) +@pytest.mark.parametrize("per_tensor_a_scale", [True, False]) +@pytest.mark.parametrize("per_tensor_b_scale", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("use_azp", [True, False]) +@pytest.mark.parametrize("output_type", DTYPE) +@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES) +def test_onednn_int8_scaled_gemm( + n: int, + k: int, + m_list: tuple[int], + per_tensor_a_scale: bool, + per_tensor_b_scale: bool, + use_bias: bool, + use_azp: bool, + output_type: torch.dtype, + primitive_cache_size: int, +): + for m in m_list: + onednn_int8_gemm_test_helper( + primitive_cache_size=primitive_cache_size, + m=m, + n=n, + k=k, + per_tensor_a_quant=per_tensor_a_scale, + per_tensor_b_quant=per_tensor_b_scale, + use_bias=use_bias, + use_azp=use_azp, + out_dtype=output_type, + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 59f2d7737f..3081aff114 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1827,3 +1827,86 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"): M = mat1.size(0) N = mat2.size(0) return torch.empty((M, N), dtype=out_dtype) + + +class CPUDNNLGEMMHandler: + + def __init__(self) -> None: + self.handler: Optional[int] = None + self.n = -1 + self.k = -1 + + def __del__(self): + if self.handler is not None: + torch.ops._C.release_dnnl_matmul_handler(self.handler) + + +def create_onednn_scaled_mm( + weight: torch.Tensor, # [K, N] + weight_scales: torch.Tensor, + output_type: torch.dtype, + dynamic_quant: bool, + use_azp: bool, + primitive_cache_size: int = 128, +) -> CPUDNNLGEMMHandler: + handler = CPUDNNLGEMMHandler() + handler.k, handler.n = weight.size() + handler.handler = torch.ops._C.create_onednn_scaled_mm_handler( + weight, weight_scales, output_type, dynamic_quant, use_azp, + primitive_cache_size) + return handler + + +def onednn_scaled_int8_quant(input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + azp: Optional[torch.Tensor] = None, + symmetric: bool = True): + 
""" + Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. + + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + token_num = input.numel() // input.shape[-1] + input = input.view((token_num, input.shape[-1])) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == ( + azp + is None), "azp must only be provided for asymmetric quantization." + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. + input_scales = torch.empty((token_num, 1), + device=input.device, + dtype=torch.float32) + input_azp = None if symmetric else torch.empty_like(input_scales, + dtype=torch.int32) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, + input_azp) + return output, input_scales, input_azp + + +def onednn_scaled_mm( + dnnl_handler: CPUDNNLGEMMHandler, + x: torch.Tensor, + output: torch.Tensor, + input_scale: Optional[torch.Tensor], + input_zp: Optional[torch.Tensor], + input_zp_adj: Optional[torch.Tensor], + bias: Optional[torch.Tensor], +) -> torch.Tensor: + torch.ops._C.onednn_scaled_mm(output, x, input_scale, input_zp, + input_zp_adj, bias, dnnl_handler.handler) + + return output diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b16c21b701..fcc6987d26 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -360,10 +360,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): elif current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: from vllm.model_executor.layers.fused_moe import cpu_fused_moe - dtype = layer.w13_weight.dtype + from vllm.model_executor.layers.utils import ( + check_cpu_sgl_kernel) + dtype_w13 = layer.w13_weight.dtype + _, n_w13, k_w13 = layer.w13_weight.size() + dtype_w2 = layer.w2_weight.dtype + _, n_w2, k_w2 = layer.w2_weight.size() if (envs.VLLM_CPU_SGL_KERNEL - and torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16): + and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13) + and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)): packed_w13_weight = torch.ops._C.convert_weight_packed( layer.w13_weight) assert packed_w13_weight.size() == layer.w13_weight.size() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 654e2ec7b2..9b1ab7af0a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,11 +199,10 @@ class UnquantizedLinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: + from vllm.model_executor.layers.utils import check_cpu_sgl_kernel N, K = layer.weight.size() dtype = layer.weight.dtype - if (torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16 and N % 32 == 0 - and K % 32 == 0): + if check_cpu_sgl_kernel(N, K, dtype): 
packed_weight = torch.ops._C.convert_weight_packed( layer.weight) assert packed_weight.size() == layer.weight.size() @@ -215,7 +214,8 @@ class UnquantizedLinearMethod(LinearMethodBase): else: logger.warning( "CPU SGL kernels require Intel AMX support," - " bfloat16 weight, IC and OC are divisible by 32.") + " bf16/fp16/int8 weight, IC and OC are divisible by " + "32 and 16.") layer.use_cpu_sgl = False def apply(self, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 18f5ce04fd..2bc68ab3eb 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -6,6 +6,8 @@ from typing import Optional from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( AiterScaledMMLinearKernel) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 @@ -18,7 +20,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { - PlatformEnum.CPU: [CutlassScaledMMLinearKernel], + PlatformEnum.CPU: [CPUScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py new file mode 100644 index 0000000000..59d2b5bce9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise) +from vllm.model_executor.layers.utils import check_cpu_sgl_kernel +from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum + +from .ScaledMMLinearKernel import (ScaledMMLinearKernel, + ScaledMMLinearLayerConfig) + + +class CPUScaledMMLinearKernel(ScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + if not current_platform.is_cpu(): + return False, "CPUScaledMM requires running on CPU." 
+ + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight = getattr(layer, self.w_q_name) + dtype = weight.dtype + N, K = weight.size() + if (current_platform.get_cpu_architecture() == CpuArchEnum.X86 + and envs.VLLM_CPU_SGL_KERNEL and self.config.input_symmetric + and check_cpu_sgl_kernel(N, K, dtype)): + self.linear_method = self._apply_weights_sgl + self.process_weights_for_sgl(layer) + else: + self.linear_method = self._apply_weights_onednn + self.process_weights_for_onednn(layer) + + def process_weights_for_onednn(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Transpose to [K, N] for convenience + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False)) + + # WEIGHT SCALE + # oneDNN kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. + is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False)) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(scale, requires_grad=False)) + + azp = (int8_traits.min - + range_min / scale).round().to(dtype=torch.int32) + replace_parameter(layer, self.i_zp_name, + torch.nn.Parameter(azp, requires_grad=False)) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # Different from cutlass, oneDNN kernels only need the AZP adjustment + # term for dynamic quantization. And s_b should be folded into the + # term. 
Such as: + # s_a * s_b * [(A - zp_a)B] + bias = + # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias = + # s_a * GEMM_output - s_a * zp_a * adj + bias + if not (self.config.input_symmetric + and self.config.is_static_input_scheme): + weight = getattr(layer, self.w_q_name) + weight_scale = getattr(layer, self.w_s_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32) + azp_adj = azp_adj * weight_scale.squeeze() + setattr(layer, self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False)) + else: + setattr(layer, self.azp_adj_name, None) + + weight = getattr(layer, self.w_q_name) + self.dnnl_handler = ops.create_onednn_scaled_mm( + weight, + getattr(layer, self.w_s_name), + torch.get_default_dtype(), + getattr(layer, self.i_s_name) is None, + not self.config.input_symmetric, + 32, + ) + # weight is prepacked and maintained by the dnnl_handler, + # release the original weight + setattr(layer, self.w_q_name, None) + del weight + + def process_weights_for_sgl(self, layer: torch.nn.Module) -> None: + # WEIGHT + weight = getattr(layer, self.w_q_name) + packed_weight = torch.ops._C.convert_weight_packed(weight) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(packed_weight, requires_grad=False)) + + if layer.bias is not None: + bias = layer.bias + layer.register_parameter( + "bias_fp32", + torch.nn.Parameter(bias.float().data, requires_grad=False)) + + # WEIGHT SCALE + # CPU SGL kernels only support per-channel. + # For per-tensor quant, convert to the per-channel case. + weight_scale = getattr(layer, self.w_s_name) + if not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + setattr(layer, self.azp_adj_name, None) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return self.linear_method( + layer, + x, + bias, + ) + + def _apply_weights_onednn( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. 
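        # x_q keeps x's shape in int8. For dynamic quant (i_s is None), x_s is
        # a per-token (num_tokens, 1) float32 tensor; for static quant it is
        # the provided i_s. x_zp is None whenever input_symmetric is set (see
        # the quant helper above).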
+ x_q, x_s, x_zp = ops.onednn_scaled_int8_quant( + x, i_s, i_zp, self.config.input_symmetric) + + m = x.size(0) + n = self.dnnl_handler.n + out = torch.empty((m, n), dtype=x.dtype) + ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, + bias) + + return out + + def _apply_weights_sgl( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + return torch.ops._C.int8_scaled_mm_with_quant( + x, + w_q, + w_s, + layer.bias_fp32 if bias is not None else None, + x.dtype, + True, + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 6ddd4a9ec4..2f982f96b0 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -25,8 +25,8 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): def can_implement( cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: - if (not current_platform.is_cuda() and not current_platform.is_cpu()): - return False, "CutlassScaledMM requires running on CUDA or CPU." + if not current_platform.is_cuda(): + return False, "CutlassScaledMM requires running on CUDA." return True, None diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 48a347a8f5..2897f75b31 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -142,6 +142,12 @@ direct_register_custom_op( ) +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype): + return (torch._C._cpu._is_amx_tile_supported() + and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 + and n % 16 == 0) + + def cpu_unquantized_gemm(layer: torch.nn.Module, x: torch.Tensor, weight: torch.Tensor, From 2461d9e562e5852555c76e0dbed06979f9c6c688 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 11:05:20 +0800 Subject: [PATCH 454/932] [CI/Build] Split out mm processor tests (#23260) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 15 +++++++++++---- .../{ => processing}/test_tensor_schema.py | 7 +++---- vllm/model_executor/models/cohere2_vision.py | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) rename tests/models/multimodal/{ => processing}/test_tensor_schema.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7454206640..5869ae21d5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -545,6 +545,15 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' +- label: Multi-Modal Processor Test + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] torch_nightly: true @@ -554,9 +563,7 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model - - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # 
Needs mp_method="spawn" + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 @@ -567,7 +574,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - label: Multi-Modal Models Test (Extended) 2 mirror_hardwares: [amdexperimental] diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py similarity index 98% rename from tests/models/multimodal/test_tensor_schema.py rename to tests/models/multimodal/processing/test_tensor_schema.py index 143b4c8fc8..79164f02c3 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -24,9 +24,9 @@ from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore -from ...conftest import VllmRunner -from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS -from ..utils import dummy_hf_overrides +from ....conftest import VllmRunner +from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -147,7 +147,6 @@ def get_model_id_to_test( return filtered_results -@pytest.mark.core_model @pytest.mark.parametrize( "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index fca1aee835..179cc2af8e 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -170,6 +170,8 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): # The current implementation of get_number_of_image_patches # is incorrect, so we patch it here. + # TODO: Revert once + # https://github.com/huggingface/transformers/pull/40312 is released. 
# return image_processor.get_number_of_image_patches(image_height, # image_width, {}) From 3663870c72da246d81d8bd8f5c059890fb3f3f5d Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 21 Aug 2025 06:08:51 +0300 Subject: [PATCH 455/932] [V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035) Signed-off-by: asafg Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg --- docs/usage/v1_guide.md | 2 +- .../models/language/generation/test_hybrid.py | 20 ++---- vllm/config/compilation.py | 1 + .../layers/mamba/mamba_mixer.py | 66 ++++++++++++++++--- vllm/model_executor/models/jamba.py | 8 ++- vllm/model_executor/models/mamba.py | 7 +- vllm/v1/attention/backends/mamba1_attn.py | 37 +++++------ vllm/v1/attention/backends/mamba2_attn.py | 45 ++----------- vllm/v1/attention/backends/mamba_attn.py | 55 ++++++++++++++++ 9 files changed, 154 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba_attn.py diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 54af970ea8..9bf0c5842c 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,7 +107,7 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). 
Please note that diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index aee0a50336..f8c0eaa8cf 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -54,16 +54,14 @@ V1_SUPPORTED_MODELS = [ "tiiuae/Falcon-H1-0.5B-Base", ] +FULL_CUDA_GRAPH_MODELS = [ + "ai21labs/Jamba-tiny-dev", + "Zyphra/Zamba2-1.2B-instruct", +] + # Avoid OOM MAX_NUM_SEQS = 4 -# Once we add support for FCG in Mamba1, this list will be removed and tests -# all test cases will use enforce_eager=False -ENFORCE_EAGER_MODELS_V1 = [ - "state-spaces/mamba-130m-hf", - "ai21labs/Jamba-tiny-dev", -] - @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @@ -101,19 +99,13 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: - enforce_eager = False with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - - if model in ENFORCE_EAGER_MODELS_V1: - enforce_eager = True - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - enforce_eager=enforce_eager, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) @@ -373,7 +365,7 @@ def test_distributed_correctness( ) -@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_full_cuda_graph( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56a2183f8e..c654485f4f 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -336,6 +336,7 @@ class CompilationConfig: "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2", + "vllm.mamba_mixer", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 3c7322260d..a24e72778b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -27,6 +27,8 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata @@ -183,22 +185,26 @@ class MambaMixer(MambaBase, CustomOp): def forward(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): if not envs.VLLM_USE_V1: - return CustomOp.forward(self, hidden_states, mamba_cache_params) + CustomOp.forward(self, hidden_states, output, mamba_cache_params) else: - return self.forward_cuda( + torch.ops.vllm.mamba_mixer( hidden_states, - mamba_cache_params, + output, + self.prefix, ) def forward_native(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): pass def forward_cuda(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): """ Run the Mamba-1 SSM pipeline. 
@@ -237,6 +243,7 @@ class MambaMixer(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] has_initial_states = mamba1_metadata.has_initial_states + num_padded_decodes = mamba1_metadata.num_padded_decodes else: assert isinstance(attn_metadata, AttentionMetadata) assert mamba_cache_params is not None @@ -248,6 +255,7 @@ class MambaMixer(MambaBase, CustomOp): has_initial_states = None if context_lens_tensor is not None: has_initial_states = context_lens_tensor > 0 + num_padded_decodes = attn_metadata.num_decode_tokens # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -267,6 +275,7 @@ class MambaMixer(MambaBase, CustomOp): num_decodes = attn_metadata.num_decode_tokens # token count (=request) has_prefill = num_prefill_tokens > 0 has_decode = num_decode_tokens > 0 + num_actual_tokens = num_prefill_tokens + num_decode_tokens prefill_decode_split = split_batch_to_prefill_and_decode( hidden_states_BC, @@ -278,6 +287,7 @@ class MambaMixer(MambaBase, CustomOp): num_decode_tokens, num_prefills, num_decodes, + num_padded_decodes, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d @@ -371,7 +381,7 @@ class MambaMixer(MambaBase, CustomOp): else: out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] - return out + output[:num_actual_tokens] = out def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None @@ -421,18 +431,27 @@ def split_batch_to_prefill_and_decode( num_decode_tokens: int, num_prefills: int, num_decodes: int, + num_padded_decodes: int, ) -> PrefillDecodeSplit: + num_actual_tokens = num_prefill_tokens + num_padded_decodes + if envs.VLLM_USE_V1: # In v1, decode tokens come first, then prefill tokens. 
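        # Example: in a pure-decode batch of 3 requests padded to a CUDA graph
        # capture size of 4, num_padded_decodes == 4 and num_prefill_tokens
        # == 0, so the first 4 positions (3 real decodes plus 1 padding slot
        # whose state index is PAD_SLOT_ID) form the decode group. In mixed
        # prefill/decode batches no padding is applied and
        # num_padded_decodes == num_decodes.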
hidden_states_BC_d, hidden_states_BC_p = torch.split( - hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1) - gate_d, gate_p = torch.split(gate, - [num_decode_tokens, num_prefill_tokens], + hidden_states_BC[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) + gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], dim=-1) + + # num_padded_decodes accounts for CUDA graph padding when applicable state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, [num_decodes, num_prefills], dim=0) + state_indices_tensor[:num_padded_decodes + num_prefills], + [num_padded_decodes, num_prefills], + dim=0) query_start_loc_p = (query_start_loc[-num_prefills - 1:] - - num_decodes if num_prefills > 0 else None) + num_padded_decodes if num_prefills > 0 else None) has_initial_states_p = has_initial_states[-num_prefills:] if ( has_initial_states is not None and num_prefills > 0) else None else: @@ -459,3 +478,32 @@ def split_batch_to_prefill_and_decode( query_start_loc_p=query_start_loc_p, has_initial_states_p=has_initial_states_p, ) + + +def mamba_mixer( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None) + + +def mamba_mixer_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer", + op_func=mamba_mixer, + mutates_args=["output"], + fake_impl=mamba_mixer_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 0b32d6f256..3c1a0b68df 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -10,6 +10,7 @@ from transformers import JambaConfig from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -154,10 +155,10 @@ class JambaMambaDecoderLayer(nn.Module): hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.mamba(hidden_states, mamba_cache_params) + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params) # Fully Connected - hidden_states, residual = self.pre_ff_layernorm( - hidden_states, residual) + hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) return hidden_states, residual @@ -278,6 +279,7 @@ ALL_DECODER_LAYER_TYPES = { } +@support_torch_compile class JambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index f4aaf0c6f4..f02499a4f9 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -9,6 +9,7 @@ from torch import nn from transformers import MambaConfig from vllm import envs +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from 
vllm.model_executor.layers.layernorm import RMSNorm @@ -81,10 +82,12 @@ class MambaDecoderLayer(nn.Module): else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params) + return output, residual +@support_torch_compile class MambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 6cdc509083..97a1aa86dd 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -2,16 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -31,24 +31,11 @@ class Mamba1AttentionMetadata: num_prefill_tokens: int num_decodes: int num_decode_tokens: int + num_padded_decodes: int class Mamba1AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba1AttentionMetadata]): - reorder_batch_threshold: ClassVar[int] = 1 - - def __init__( - self, - kv_cache_spec: AttentionSpec, - vllm_config: VllmConfig, - device: torch.device, - layer_names: list[str], - ): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec - self.device = device - self.vllm_config = vllm_config - self.layer_names = layer_names + BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]): def build( self, @@ -67,9 +54,18 @@ class Mamba1AttentionMetadataBuilder( decode_threshold=1)) has_initial_states = None + padded_decodes = num_decodes if num_prefills > 0: has_initial_states = context_lens_tensor > 0 + elif (num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs + and self.compilation_config.full_cuda_graph): + state_indices_for_decode = state_indices_tensor[:num_decodes] + padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_( + state_indices_for_decode, non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:padded_decodes] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID return Mamba1AttentionMetadata( query_start_loc=query_start_loc, @@ -80,4 +76,5 @@ class Mamba1AttentionMetadataBuilder( num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, + num_padded_decodes=padded_decodes, ) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index ace078e2b2..ed30884fdb 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -2,18 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from 
vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import AttentionSpec def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, @@ -88,29 +88,14 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba2AttentionMetadata]): - cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE - - reorder_batch_threshold: ClassVar[int] = 1 + BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") - self.decode_cudagraph_max_bs = min( - self.vllm_config.scheduler_config.max_num_seqs, - self.compilation_config.max_capture_size) - self.state_indices_tensor = torch.empty( - (self.decode_cudagraph_max_bs, ), - dtype=torch.int32, - device=device, - ) def build(self, common_prefix_len: int, @@ -187,19 +172,3 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor=state_indices_tensor, ) return attn_metadata - - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with Mamba. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "Mamba only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." 
- - m.max_query_len = 1 # decode-only - - return self.build(0, m) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py new file mode 100644 index 0000000000..07ef7cb69a --- /dev/null +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import abc +from typing import ClassVar, TypeVar + +import torch + +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + +M = TypeVar("M") + + +class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + reorder_batch_threshold: ClassVar[int] = 1 + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + self.compilation_config = vllm_config.compilation_config + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + m.max_query_len = 1 # decode-only + + return self.build(0, m) \ No newline at end of file From f94bf9b924afe2e720b864590c9798b911e77e66 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 20 Aug 2025 23:09:39 -0400 Subject: [PATCH 456/932] [Compile] Fix Compile Warning SM100 Cutlass MLA (#23287) Signed-off-by: yewentao256 --- csrc/attention/mla/sm100_cutlass_mla_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index e0e95d0629..6dd6f269f3 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -167,7 +167,7 @@ typename T::Fmha::Arguments args_from_options( // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will // perform worse with larger context length and smaller batch sizes. - num_kv_splits, // split_kv + static_cast(num_kv_splits), // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -264,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba // Assumes device 0 when getting sm_count. arguments.hw_info.sm_count = sm_count <= 0 ? 
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; - arguments.split_kv = num_kv_splits; + arguments.split_kv = static_cast(num_kv_splits); MlaSm100Type::Fmha::set_split_kv(arguments); return MlaSm100Type::Fmha::get_workspace_size(arguments); From 655a09f6538e6b09af23771dcc4fcebd72a15b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=A5=87=28yann=20qi=29?= <51905299+yannqi@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:08:52 +0800 Subject: [PATCH 457/932] [Model][VLM] Support R-4B Model (#23246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yannqi Signed-off-by: 杨奇(yann qi) <51905299+yannqi@users.noreply.github.com> Signed-off-by: Cyrus Leung Co-authored-by: yannqiyang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 23 ++++ .../vision_language_multi_image.py | 34 ++++++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/rvl.py | 103 ++++++++++++++++++ 7 files changed, 165 insertions(+) create mode 100644 vllm/model_executor/models/rvl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7308d00106..831bfb1e93 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -652,6 +652,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 88bbbfdfbd..e7a7a30dd3 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1436,6 +1436,28 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# R-4B +def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "YannQi/R-4B" + + prompts = [ + f"<|im_start|>user \n{question}<|im_end|><|im_start|>assistant\n" + for question in questions + ] + + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + limit_mm_per_prompt={modality: 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1622,6 +1644,7 @@ model_example_map = { "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_omni": run_qwen2_5_omni, + "rvl": run_r_vl, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, "step3": run_step3, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index eabd9453f3..d9242efa85 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -992,6 +992,39 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "YannQi/R-4B" + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" @@ -1193,6 +1226,7 @@ model_example_map = { "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "rvl": load_r_vl, "smolvlm": load_smolvlm, "step3": load_step3, "tarsier": load_tarsier, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 02aecfad82..adc8b2510d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -316,6 +316,7 @@ def _test_processing_correctness_one( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", + 
"YannQi/R-4B", "Skywork/Skywork-R1V-38B", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "stepfun-ai/step3", diff --git a/tests/models/registry.py b/tests/models/registry.py index 6e6acfb8cd..4f69f90b6a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -489,6 +489,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 + "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", + trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 78ef270598..39a3e425a4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = { "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), + "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py new file mode 100644 index 0000000000..efdb010046 --- /dev/null +++ b/vllm/model_executor/models/rvl.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping + +import torch +import torch.nn as nn +from transformers.activations import GELUActivation + +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict + +from .llava_next import (LlavaDummyInputsBuilder, LlavaNextMultiModalProcessor, + LlavaNextProcessingInfo) +from .llava_onevision import LlavaOnevisionForConditionalGeneration +from .utils import WeightsMapper + + +class RVLProcessingInfo(LlavaNextProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + +class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = "" + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = ( + self.info.get_image_size_with_most_features()) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class RVLMultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, + eps=1e-06) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + self.act = GELUActivation() + 
self.linear_2 = nn.Linear( + config.text_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + def forward(self, image_feature: torch.Tensor) -> torch.Tensor: + image_feature = self.pre_norm(image_feature) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextMultiModalProcessor, + info=RVLProcessingInfo, + dummy_inputs=RVLDummyInputsBuilder, +) +class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers + # v4.52 + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + "model.multi_modal_projector.": "multi_modal_projector.", + "model.image_newline": "image_newline", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config + self.multi_modal_projector = RVLMultiModalProjector(config) From 8993073dc1a7e2d31eda85812b76789046ae7c28 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Thu, 21 Aug 2025 04:15:20 +0000 Subject: [PATCH 458/932] [CI] Delete images older than 24h. (#23291) Signed-off-by: Qiliang Cui --- .buildkite/scripts/tpu/cleanup_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh index 209d9c4341..740d81fb39 100755 --- a/.buildkite/scripts/tpu/cleanup_docker.sh +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune --force --filter "until=72h" --all + docker volume prune -f && docker system prune --force --filter "until=24h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." 
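As a quick end-to-end check of the R-4B support added above, the sketch below runs one image prompt through the offline API. Only the model ID `YannQi/R-4B` and the engine limits come from the patch; the chat-message layout and the bundled sample image are illustrative assumptions, and the `run_r_vl()`/`load_r_vl()` examples in `examples/offline_inference/` remain the reference for prompt construction.

from transformers import AutoProcessor

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

model_id = "YannQi/R-4B"

# Build the prompt through the model's own chat template so the image
# placeholder token lands where the processor expects it. The message layout
# here is an assumption; load_r_vl() above shows the canonical form.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
prompt = processor.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Engine limits mirror the run_r_vl() example added in this series.
llm = LLM(model=model_id,
          trust_remote_code=True,
          max_model_len=16384,
          limit_mm_per_prompt={"image": 1})

outputs = llm.generate(
    {
        "prompt": prompt,
        # Any PIL image works; the bundled asset is only a stand-in.
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)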
From f64ee61d9e7014a5f230a8347186b952dbe483de Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Aug 2025 00:21:05 -0400 Subject: [PATCH 459/932] [CI] Block the cu126 wheel build while broken (#23285) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index e20ce54ca7..f96c38bf57 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -27,7 +27,12 @@ steps: env: DOCKER_BUILDKIT: "1" + - block: "Build CUDA 12.6 wheel" + key: block-build-cu126-wheel + depends_on: ~ + - label: "Build wheel - CUDA 12.6" + depends_on: block-build-cu126-wheel id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge From f571ff8eb6d9117c6a418f7f925921968dff8ac8 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:28:32 -0700 Subject: [PATCH 460/932] [Sampler] Support returning final logprobs (#22387) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Woosuk Kwon --- docs/usage/v1_guide.md | 7 ++- tests/v1/sample/test_logprobs.py | 10 ++-- vllm/config/__init__.py | 30 ++++++---- vllm/engine/arg_utils.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 65 ++++++++++---------- vllm/v1/sample/sampler.py | 79 +++++++++++++++++++------ vllm/v1/sample/tpu/sampler.py | 2 +- 7 files changed, 125 insertions(+), 69 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 9bf0c5842c..b897689136 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -154,12 +154,15 @@ differences compared to V0: ##### Logprobs Calculation -Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. +By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. before applying any logits post-processing such as temperature scaling or penalty adjustments). As a result, the returned logprobs do not reflect the final adjusted probabilities used during sampling. -Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. +You can adjust this behavior by setting the `--logprobs-mode` flag. +Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`. +Raw means the values before applying any logit processors, like bad words. +Processed means the values after applying all processors, including temperature and top_k/top_p. ##### Prompt Logprobs with Prefix Caching diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 8bd142e87b..e835c02963 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -456,9 +456,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(logprob) == vocab_size -@pytest.mark.parametrize( - "logprobs_mode", - ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) +@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
@@ -487,12 +485,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if "logprobs" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, + LogprobsMode.PROCESSED_LOGPROBS): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if "logits" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGITS, + LogprobsMode.PROCESSED_LOGITS): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 959f111ced..2973cb92d1 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -257,11 +257,16 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", - "processed_logits"] MMEncoderTPMode = Literal["weights", "data"] +class LogprobsMode(enum.Enum): + RAW_LOGITS = "raw_logits" + RAW_LOGPROBS = "raw_logprobs" + PROCESSED_LOGITS = "processed_logits" + PROCESSED_LOGPROBS = "processed_logprobs" + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: @@ -363,12 +368,13 @@ class ModelConfig: specified in `SamplingParams`. The default value comes the default for the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = "raw_logprobs" + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS """Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. - Raw means the values before applying logit processors, like bad words. - Processed means the values after applying such processors. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. """ disable_sliding_window: bool = False """Whether to disable sliding window. If True, we will disable the sliding @@ -2586,7 +2592,7 @@ class MultiModalConfig: skip_mm_profiling: bool = False """ - When enabled, skips multimodal memory profiling and only profiles with + When enabled, skips multimodal memory profiling and only profiles with language backbone model during engine initialization. This reduces engine startup time but shifts the responsibility to users for @@ -2649,24 +2655,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. 
""" step_tag_id: Optional[int] = None """ @@ -2692,9 +2698,9 @@ class PoolerConfig: max_embed_len: Optional[int] = None """ - Maximum input length allowed for embedding generation. When set, allows + Maximum input length allowed for embedding generation. When set, allows inputs longer than max_embed_len to be accepted for embedding models. - This parameter enables accepting long inputs without requiring + This parameter enables accepting long inputs without requiring VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds max_embed_len, it will be handled according to the original max_model_len validation logic. Defaults to None (i.e. set to max_model_len). diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f3afc015f6..b0f50b4429 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -516,6 +516,7 @@ class EngineArgs: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", + choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index e0434c8f3d..7bd4a5a380 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -8,6 +8,7 @@ import torch.nn as nn from packaging import version from vllm import envs +from vllm.config import LogprobsMode from vllm.logger import init_logger from vllm.platforms import current_platform @@ -28,9 +29,16 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. """ - def __init__(self): + def __init__( + self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: super().__init__() - if current_platform.is_cuda(): + self.logprobs_mode = logprobs_mode + # flashinfer optimization does not apply if intermediate + # logprobs/logits after top_k/top_p need to be returned + if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, + LogprobsMode.PROCESSED_LOGPROBS + ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ if version.parse(flashinfer_version) < version.parse("0.2.3"): @@ -63,10 +71,12 @@ class TopKTopPSampler(nn.Module): "native implementation of top-p & top-k sampling. For the " "best performance, please install FlashInfer.") self.forward = self.forward_native - elif current_platform.is_tpu(): - self.forward = self.forward_tpu else: self.forward = self.forward_native + if current_platform.is_tpu(): + self.apply_top_k_top_p = apply_top_k_top_p_tpu + else: + self.apply_top_k_top_p = apply_top_k_top_p def forward_native( self, @@ -74,15 +84,20 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """ PyTorch-native implementation of top-k and top-p sampling. The logits tensor may be updated in-place. 
""" - logits = apply_top_k_top_p(logits, k, p) + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return random_sample(probs, generators), logits_to_return def forward_cuda( self, @@ -90,34 +105,24 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """More optimized implementation for top-k and top-p sampling.""" - if k is None and p is None: - # We prefer `random_sample` over `flashinfer_sample` when sorting is - # not needed. This is because `random_sample` does not require - # CPU-GPU synchronization while `flashinfer_sample` does. - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) - if generators: - logger.warning_once("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") + # We prefer `random_sample` over `flashinfer_sample` when sorting is + # not needed. This is because `random_sample` does not require + # CPU-GPU synchronization while `flashinfer_sample` does. + if (k is None and p is None) or generators: + if generators: + logger.warning_once("FlashInfer 0.2.3+ does not support " + "per-request generators. Falling back to " + "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # because of slicing operation in logits_processor. - return flashinfer_sample(logits.contiguous(), k, p, generators) - - def forward_tpu( - self, - logits: torch.Tensor, - generators: dict[int, torch.Generator], - k: Optional[torch.Tensor], - p: Optional[torch.Tensor], - ) -> torch.Tensor: - logits = apply_top_k_top_p_tpu(logits, k, p) - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return flashinfer_sample(logits.contiguous(), k, p, generators), None def apply_top_k_top_p_tpu( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 82f51298f1..70ec8a0c26 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" +from typing import Optional + import torch import torch.nn as nn @@ -18,10 +20,50 @@ _SAMPLING_EPS = 1e-5 class Sampler(nn.Module): + """ + A layer that samples the next tokens from the model's outputs + with the following steps in order: - def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): + 1. If logprobs are requested: + a) If `logprobs_mode` is `raw_logprobs`, compute logprobs + as the final logprobs to return. + b) If `logprobs_mode` is `raw_logits`, clone the logits + as the final logprobs to return. + 2. Convert logits to float32. + 3. Apply allowed token ids whitelist. + 4. Apply bad words exclusion. + 5. 
Apply logit processors which are not argmax-invariant, + i.e. that can impact greedy sampling. + a) Min tokens processor + b) Logit bias processor + 6. Apply penalties + a) Repetition penalty + b) Frequency penalty + c) Presence penalty + 7. Sample the next tokens. `sample` method performs the following steps: + a) If not `all_random`, perform greedy sampling. If `all_greedy`, + return the greedily sampled tokens and final logprobs if requested. + b) Apply temperature. + c) Apply logit processors which are argmax-invariant, by default + the min_p processor. + d) Apply top_k and/or top_p. + e) Sample the next tokens with the probability distribution. + f) If `all_random` or temperature >= epsilon (1e-5), return the + randomly sampled tokens and final logprobs if requested. Else, + return the greedily sampled tokens and logprobs if requested. + 8. Gather the logprobs of the top `max_num_logprobs` and sampled token + (if requested). Note that if the sampled token is within the top + `max_num_logprobs`, the logprob will be eventually merged in + `LogprobsProcessor` during output processing. Therefore, the + final output may contain either `max_num_logprobs + 1` or + `max_num_logprobs` logprobs. + 9. Return the final `SamplerOutput`. + """ + + def __init__(self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): super().__init__() - self.topk_topp_sampler = TopKTopPSampler() + self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() self.logprobs_mode = logprobs_mode @@ -34,13 +76,11 @@ class Sampler(nn.Module): # temperature scaling) for the top-k logprobs. # This is different from the V0 sampler, which uses the logits that # is used for sampling (after penalties and temperature scaling). - # TODO(rob): provide option for logprobs post sampling. - # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == "raw_logprobs": + if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "raw_logits": + elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: raw_logprobs = logits.clone() # Use float32 for the logits. @@ -57,15 +97,10 @@ class Sampler(nn.Module): # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) - # Get the process logprobs or logits. - if num_logprobs is not None: - if self.logprobs_mode == "processed_logprobs": - raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "processed_logits": - raw_logprobs = logits.clone() - # Sample the next token. - sampled = self.sample(logits, sampling_metadata) + sampled, processed_logprobs = self.sample(logits, sampling_metadata) + if processed_logprobs is not None: + raw_logprobs = processed_logprobs # Convert sampled token ids to int64 (long) type to ensure compatibility # with subsequent operations that may use these values as indices. # This conversion is necessary because FlashInfer sampling operations @@ -105,7 +140,7 @@ class Sampler(nn.Module): self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Sample logits based on sampling metadata. 
The various logits processing functions called in this method @@ -119,7 +154,13 @@ class Sampler(nn.Module): else: greedy_sampled = self.greedy_sample(logits) if sampling_metadata.all_greedy: - return greedy_sampled + processed_logprobs = None + if sampling_metadata.max_num_logprobs is not None: + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + processed_logprobs = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + processed_logprobs = self.compute_logprobs(logits) + return greedy_sampled, processed_logprobs assert sampling_metadata.temperature is not None @@ -132,7 +173,7 @@ class Sampler(nn.Module): logits = processor.apply(logits) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, processed_logprobs = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, @@ -140,7 +181,7 @@ class Sampler(nn.Module): ) if greedy_sampled is None: - return random_sampled + return random_sampled, processed_logprobs sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, @@ -148,7 +189,7 @@ class Sampler(nn.Module): random_sampled, out=greedy_sampled, # Reuse tensor ) - return sampled + return sampled, processed_logprobs def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 2c9f4892bc..04545d587e 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -65,7 +65,7 @@ class Sampler(nn.Module): logits = self.apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, _ = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, From 0c31e28e9520d96c451cc7f023fd0f0af549766a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 13:03:00 +0800 Subject: [PATCH 461/932] [Bugfix] Fix extra whitespace in strings caused by newline (#23272) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_dataset.py | 6 ++++-- examples/offline_inference/vision_language.py | 15 +++++++-------- vllm/benchmarks/datasets.py | 6 ++++-- vllm/model_executor/model_loader/tpu.py | 11 ++++++----- vllm/model_executor/models/hyperclovax_vision.py | 9 ++++----- vllm/model_executor/models/phi4mm.py | 6 +++--- vllm/transformers_utils/configs/eagle.py | 4 ++-- 7 files changed, 30 insertions(+), 27 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index e1a856026c..2ea4f9ccaf 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -958,8 +958,10 @@ class InstructCoderDataset(HuggingFaceDataset): for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." 
+ ) # apply template prompt = tokenizer.apply_chat_template( diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e7a7a30dd3..8d97ba2668 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -283,8 +283,10 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" + ( + "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>" + f"{question}<|assistant|>" + ) for question in questions ] @@ -767,15 +769,13 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ - f"<|im_start|>user

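The whitespace fix in #23272 above works because a backslash line continuation inside a string literal keeps the next line's indentation as part of the string, while adjacent string literals are concatenated without adding any whitespace. A minimal, self-contained sketch of the difference (the helper names and the sample instruction are illustrative, not taken from the patch):

def prompt_with_continuation(instruction: str) -> str:
    # The indentation after the trailing backslash becomes literal spaces
    # inside the resulting string.
    return f"{instruction} Just output \
        the code, do not include any explanation."


def prompt_with_concatenation(instruction: str) -> str:
    # Adjacent string literals are joined at compile time, so no stray
    # whitespace leaks into the prompt.
    return (f"{instruction} Just output "
            "the code, do not include any explanation.")


if __name__ == "__main__":
    bad = prompt_with_continuation("Refactor this function.")
    good = prompt_with_concatenation("Refactor this function.")
    assert "  " in bad        # leaked indentation shows up as runs of spaces
    assert "  " not in good   # single spaces only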
z?uuZnHz{*>(a=kr^!DPjde86*v>Qv$@gx{CV_jX`L83kTP-m3HV_qX2 z=%fUUd})IYTvy7t+6y%?E9S%PyzTGZmgM5FZc^T|>f)bMN?%F1z77iEBJ0-_g&S;N zvt!hVyso-2ijrm}e-88?``r3Wm6Rf!mQ4LqS2ALM#N)2wQNh&hHfs+frCzAM5EAuj z)9rm(dsO&TCvpH~jhTN0TbAuD*LVeboJy&rs3=qVydm$Uwu~bJ-Ilx~r|%?`vn!Xhp0s z8$^5k8e2EN%}pb{)I?sr&X}80tNiiQOhFvcqcJ>YhiVz3g_?H~d7nw|U3aRe^ksQh z;k)tO^iXq6>eH&5-^%SPaYbhp>bO`ijd@!`%JPsZ|s;>!&Oa=Hxm_ zsy~sUiL;Zuyzd-aZQX9&Ro!p{e^#uW&*nN^MWT7J@UBOAqBoJsr=%#oPjX58otFMxem^wkO$w_3Zcj3KhNgSr0$PQ# zf~=1AJ0hMl$fZW8IRVCLbk#L7XX*j3ZyrkD71CEoSru(x_Pv?7TV~+sC4N5iKf*-6 z(JY2O$|1ow`Ck^&mH}M|pSOQ_;vjQZXtfGF>5S7&U7gc^jA{M*Ly!@F@9ck#_?MR$ zA8v)La{nV7A3er&vootVX#MH+a}s;ykaW?vM+I$_Ceg|(>RZ)FpZzN z|3Ck+Cfe>1gJ0!XzW0A#0ZxqAXLzNiXVqDA*h?5t+0K(zxiNK~ogDYPFA?Hkh(6zc zVq|()GSMHb)|s}h`#pGY&~kVI)+S^RGQ{?G`nJ1PkZrJ&QB@+q!GHvGA^Tzv%it=U z|D7nyj>i3<&ku6l(2oA+A7S`Aoh^NlBv_f!P^xg-p}t~K;%e%1`TT8HN}!)%<36~o VbJ;o8^#c4+zNN14PR=ag{{qS|ao+#{ literal 0 HcmV?d00001 From 7caec10e7b978853f8f87fe1d0cf77aa85066cdb Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 16 Aug 2025 13:16:34 +0800 Subject: [PATCH 325/932] [XPU]avoid circular import during XPU init (#23017) Signed-off-by: Kunshang Ji --- vllm/platforms/xpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 66ebc8ad9d..af24437f64 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -from vllm.config import CUDAGraphMode from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS @@ -105,6 +104,8 @@ class XPUPlatform(Platform): and not cls.device_support_bf16(): model_config.dtype = torch.float16 + # lazy import to avoid circular import + from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ From 5157827cfc0fd06d361897b2cc912ee1b5bc6277 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 16 Aug 2025 01:36:27 -0400 Subject: [PATCH 326/932] [Build] Env var to disable sccache (#22968) Signed-off-by: Lucas Wilkinson --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 919300e143..cc3037ebb7 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,8 @@ MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: - return which("sccache") is not None + return which("sccache") is not None and \ + not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))) def is_ccache_available() -> bool: From 78863f8c5c67367f32533dd0230faae51ec51145 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Sat, 16 Aug 2025 01:25:10 -0500 Subject: [PATCH 327/932] [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962) Signed-off-by: Andrew Sansom --- .../openai/test_prompt_validation.py | 49 +++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 6 ++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e31a1d0776..4197583074 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io + # imports for guided decoding tests import openai +import 
pybase64 import pytest import regex as re +import torch + +from vllm.entrypoints.openai.serving_engine import OpenAIServing from ...utils import RemoteOpenAIServer @@ -42,3 +48,46 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.parametrize("dtype", + [torch.float32, torch.bfloat16, torch.float16]) +@pytest.mark.parametrize( + "layout", + [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]) +@pytest.mark.parametrize("seq_len", [2, 10]) +@pytest.mark.parametrize("hidden_size", [2, 10]) +def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, + seq_len: int, hidden_size: int): + # construct arbitrary tensors of various dtypes, layouts, and sizes. + # We need to check against different layouts to make sure that if a user + # uses sparse tensors to reduce the transmission size of prompt embeddings, + # we must cast them to dense/strided before passing them into the engine. + # We don't use non-CPU tensors in this test to avoid preemptively + # initializing cuda and break other tests in the suite that fork processes. + # We also need to make sure that we only use devices that are actually + # available in the environment the test is running on. For simplicity, + # we just test against CPU. + tensor = torch.randn((seq_len, hidden_size), dtype=dtype) + if layout == torch.strided: + tensor = tensor.contiguous() + elif layout == torch.sparse_coo: + tensor = tensor.to_sparse_coo() + elif layout == torch.sparse_csc: + tensor = tensor.to_sparse_csc() + elif layout == torch.sparse_csr: + tensor = tensor.to_sparse_csr() + + buffer = io.BytesIO() + torch.save(tensor, buffer) + buffer.seek(0) + encoded_tensor = pybase64.b64encode(buffer.getvalue()) + + loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) + assert len(loaded_prompt_embeds) == 1 + loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] + assert loaded_tensor.device.type == "cpu" + assert loaded_tensor.layout == torch.strided + torch.testing.assert_close(loaded_tensor, + tensor.to("cpu").to_dense(), + equal_nan=True) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d6f92a6330..0f4a7c0186 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1006,8 +1006,8 @@ class OpenAIServing: # OPTIMIZATION priority = orig_priority - 1 + @staticmethod def _load_prompt_embeds( - self, prompt_embeds: Optional[Union[bytes, list[bytes]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None ) -> list[EmbedsPrompt]: @@ -1015,12 +1015,14 @@ class OpenAIServing: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor = torch.load(io.BytesIO( pybase64.b64decode(embed, validate=True)), - weights_only=True) + weights_only=True, + map_location=torch.device("cpu")) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, torch.bfloat16, torch.float16, ) + tensor = tensor.to_dense() if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 6d3da472bc8f202229a8e178671f4fe72037cfb1 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 16 Aug 2025 15:26:10 +0800 Subject: [PATCH 328/932] [Misc] Add --save-dir option to benchmark_moe (#23020) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 13bf1be836..b4a03665ef 100644 
--- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -542,6 +543,7 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -552,7 +554,8 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: json.dump(configs, f, indent=4) @@ -707,6 +710,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -748,6 +752,9 @@ if __name__ == "__main__": "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") From cc826a202b7b66af222374129573763237db3c1c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 15:44:50 +0800 Subject: [PATCH 329/932] [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867) Signed-off-by: Isotr0py --- tests/models/multimodal/test_tensor_schema.py | 143 +++++++++++++++--- vllm/model_executor/models/keye.py | 22 ++- 2 files changed, 138 insertions(+), 27 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 92390d8c2f..036624431c 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -1,17 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from functools import partial +from typing import Any, Union from unittest.mock import patch +import numpy as np import pytest +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from PIL import Image from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs) from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils import GiB_bytes, set_default_torch_num_threads +from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -23,12 +32,64 @@ ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", "MiniMaxVL01ForConditionalGeneration": "broken model", } +ARCH_NEEDS_EXTRAS = [ + 
"InternVLChatModel", + "Idefics3ForConditionalGeneration", + "LlavaForConditionalGeneration", + "MiniCPMV", + "PaliGemmaForConditionalGeneration", +] +REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} + +ImageInput = list[Image.Image] +VideoInput = Union[list[Image.Image], list[np.ndarray], + list[tuple[np.ndarray, dict[str, Any]]]] +AudioInput = list[tuple[np.ndarray, int]] + + +def _resize_data(_data: Union[Image.Image, np.ndarray], + size_factor: float) -> Union[Image.Image, np.ndarray]: + assert size_factor <= 1, "Size factor must be less than 1" + # Image input + if isinstance(_data, Image.Image): + W, H = _data.width, _data.height + W, H = map(lambda x: int(x * size_factor), (W, H)) + return _data.resize((W, H)) + # Video input with PIL Images + elif is_list_of(_data, Image.Image): + W, H = next(iter(_data)).width, next(iter(_data)).height + T = len(_data) + T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H)) + return [d.resize((W, H)) for d in _data[:T]] + # Video input with numpy arrays + elif isinstance(_data, np.ndarray) and _data.ndim >= 4: + T, H, W, C = _data.shape[-4:] + T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W)) + return _data[..., :T, :H, :W, :C] + # Audio input + elif isinstance(_data, np.ndarray) and _data.ndim == 1: + return _data[:int(len(_data) * size_factor)] + raise AssertionError("This line should be unreachable.") + + +def resize_mm_data( + data: Union[ImageInput, VideoInput, AudioInput], + size_factors: tuple[float, + ...]) -> Union[ImageInput, VideoInput, AudioInput]: + size_factors = size_factors[:len(data)] + if is_list_of(data, (Image.Image, np.ndarray, list)): + return [_resize_data(d, s) for d, s in zip(data, size_factors)] + elif is_list_of(data, tuple): + return [(_resize_data(d, s), meta) + for (d, meta), s in zip(data, size_factors)] + raise ValueError("Unsupported multimodal data type.") def create_batched_mm_kwargs( model_config: ModelConfig, processor: BaseMultiModalProcessor, -) -> MultiModalKwargs: + size_factors: tuple[float, ...] 
= (1.0, 0.5, 0.25), +) -> Iterable[tuple[str, int, BatchedTensorInputs]]: processing_info = processor.info dummy_inputs = processor.dummy_inputs supported_mm_limits = processing_info.get_supported_mm_limits() @@ -40,30 +101,69 @@ def create_batched_mm_kwargs( seq_len=model_config.max_model_len, mm_counts=mm_counts, ) + mm_data = processor_inputs.mm_data + resized_mm_data = { + modality: resize_mm_data(data, size_factors) + for modality, data in mm_data.items() + } + # Mistral chat outputs tokens directly, rather than text prompts + if model_config.tokenizer_mode == "mistral": + images = resized_mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + tokenizer = processing_info.get_tokenizer() + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = processor_inputs.prompt mm_kwargs = processor.apply( - prompt=processor_inputs.prompt, - mm_data=processor_inputs.mm_data, + prompt=prompt, + mm_data=resized_mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, )["mm_kwargs"] - mm_kwargs = MultiModalKwargs.batch([mm_kwargs]) - return mm_kwargs + items = [ + item for modality in supported_mm_limits + for item in mm_kwargs.get_items(modality) + ] + return group_mm_kwargs_by_modality(items) + + +def get_model_id_to_test( + model_arch_list: Iterable[str]) -> list[tuple[str, str]]: + filtered_results = [] + for model_arch in model_arch_list: + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS: + available_repos = list( + map(lambda model_id: (model_arch, model_id), + [model_info.default, *model_info.extras.values()])) + filtered_results.extend(available_repos) + else: + filtered_results.append((model_arch, model_info.default)) + return filtered_results @pytest.mark.core_model -@pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) -def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], - monkeypatch): +@pytest.mark.parametrize( + "model_arch, model_id", + get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) +def test_model_tensor_schema(model_arch: str, model_id: str, + vllm_runner: type[VllmRunner], monkeypatch): if model_arch in ARCH_TO_SKIP: pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") + if model_id in REPO_ID_TO_SKIP: + pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}") model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip", check_max_version=False) - model_id = model_info.default - hf_overrides_fn = partial(dummy_hf_overrides, model_arch=model_arch, exist_overrides=model_info.hf_overrides) @@ -119,6 +219,7 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], if model_info.v0_only: m.setenv("VLLM_USE_V1", "0") + # TODO(Isotr0py): Can we avoid initializing engine? 
with ( set_default_torch_num_threads(1), vllm_runner( @@ -145,12 +246,16 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], mm_registry = llm_engine.input_preprocessor.mm_registry processor = mm_registry.create_processor(model_config) - mm_kwargs = create_batched_mm_kwargs(model_config, processor) - def validate_model_input(model): - for modality in ("audio", "image", "video"): - method_name = f"_parse_and_validate_{modality}_input" - if hasattr(model, method_name): - getattr(model, method_name)(**mm_kwargs) + def validate_model_input(model, modality: str, + mm_kwargs: MultiModalKwargs): + method_name = f"_parse_and_validate_{modality}_input" + if hasattr(model, method_name): + getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) \ No newline at end of file + for modality, _, mm_kwargs in create_batched_mm_kwargs( + model_config, processor): + valid_func = partial(validate_model_input, + modality=modality, + mm_kwargs=mm_kwargs) + vllm_model.apply_model(valid_func) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 40c66c2268..db9ed5910d 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, VideoItem) @@ -44,6 +44,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -112,8 +113,9 @@ class KeyeImagePixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values"] - pixel_values: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] @@ -145,8 +147,9 @@ class KeyeVideoPixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values_videos"] - pixel_values_videos: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] @@ -1295,7 +1298,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, return None return quant_config - def _validate_and_reshape_mm_tensor(self, mm_input: object, + def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. 
" @@ -1310,8 +1313,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) + elif is_list_of(mm_input, torch.Tensor): + if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2 + for p in mm_input): + return mm_input + return torch.concat(list(mm_input)) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeImageInputs]: From 933f45334a79dcb69aa93178b3bbf3d9e0d46f09 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 16 Aug 2025 00:46:00 -0700 Subject: [PATCH 330/932] [Core] Make cudagraph check cuda platform only (#23005) Signed-off-by: Chengji Yao Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao Co-authored-by: Li, Jiang --- vllm/config/__init__.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 280ae60c91..72fec5e205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3535,15 +3535,6 @@ class VllmConfig: # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION - # if cudagraph_mode is not explicitly set by users, set default value - if self.compilation_config.cudagraph_mode is None: - if envs.VLLM_USE_V1 and self.compilation_config.level \ - == CompilationLevel.PIECEWISE: - self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: @@ -3552,14 +3543,28 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - # disable cudagraph when enforce eager execution - if self.model_config is not None and self.model_config.enforce_eager: - logger.info("Cudagraph is disabled under eager mode") - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif envs.VLLM_USE_V1: - self.compilation_config.cudagraph_num_of_warmups = 1 + if current_platform.is_cuda_alike(): + # if cudagraph_mode is not explicitly set by users, set default + # value + if self.compilation_config.cudagraph_mode is None: + if envs.VLLM_USE_V1 and self.compilation_config.level \ + == CompilationLevel.PIECEWISE: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - self._set_cudagraph_sizes() + # disable cudagraph when enforce eager execution + if self.model_config is not None and \ + self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + elif envs.VLLM_USE_V1: + self.compilation_config.cudagraph_num_of_warmups = 1 + + self._set_cudagraph_sizes() + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE if self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ @@ -3618,7 +3623,7 @@ class VllmConfig: current_platform.check_and_update_config(self) # final check of cudagraph mode after platform-specific update - if envs.VLLM_USE_V1: + if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ and self.model_config is not None and 
\ not self.model_config.disable_cascade_attn: From 2dbccce8a67e8004b365e7e533107c54c9542ce7 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 17:44:19 +0800 Subject: [PATCH 331/932] [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954) Signed-off-by: Isotr0py --- tests/models/registry.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3efc9a99ea..10e29e01e8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -196,7 +196,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501 - min_transformers_version="4.55.1"), + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", @@ -408,14 +409,16 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 + {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", trust_remote_code=True), - "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 - {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -455,6 +458,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 @@ -482,7 +487,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), - "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 + "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True, is_available_online=False), From de9cb617637deabab4e34db05d26c8d4d6b2ed98 Mon Sep 17 00:00:00 2001 From: Seiji 
Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 16 Aug 2025 03:21:20 -0700 Subject: [PATCH 332/932] Add docs for PrefixRepetitionDataset + enable usage with `vllm bench throughput` (#23012) Signed-off-by: Seiji Eicher Co-authored-by: Roger Wang --- benchmarks/README.md | 22 +++++++++++++- vllm/benchmarks/throughput.py | 57 ++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caff8f0342..1d715a193e 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -40,7 +40,7 @@ become available. wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv - Sonnet + Sonnet (deprecated) ✅ ✅ Local file: benchmarks/sonnet.txt @@ -51,6 +51,12 @@ become available. ✅ synthetic + + Prefix Repetition + ✅ + ✅ + synthetic + HuggingFace-VisionArena ✅ @@ -592,6 +598,20 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` +### Prefix Repetition Dataset + +```bash +vllm bench serve \ + --backend openai \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-name prefix_repetition \ + --num-prompts 100 \ + --prefix-repetition-prefix-len 512 \ + --prefix-repetition-suffix-len 128 \ + --prefix-repetition-num-prefixes 5 \ + --prefix-repetition-output-len 128 +``` + ## ⚡ Example - Request Prioritization Benchmark diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index fdf6548ada..0c19fa6dcf 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -18,9 +18,11 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, ConversationDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, + PrefixRepetitionRandomDataset, + RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs @@ -327,6 +329,12 @@ def get_requests(args, tokenizer): dataset_cls = AIMODataset common_kwargs['dataset_subset'] = None common_kwargs['dataset_split'] = "train" + elif args.dataset_name == "prefix_repetition": + dataset_cls = PrefixRepetitionRandomDataset + sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len + sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len + sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes + sample_kwargs["output_len"] = args.prefix_repetition_output_len else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -356,7 +364,11 @@ def validate_args(args): raise ValueError(f"Unsupported backend: {args.backend}") # === Dataset Configuration === - if not args.dataset and not args.dataset_path: + if ( + not args.dataset + and not args.dataset_path + and args.dataset_name not in {"prefix_repetition"} + ): print( "When dataset path is not set, it will default to random dataset") args.dataset_name = 'random' @@ -432,7 +444,10 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + choices=[ + "sharegpt", "random", "sonnet", "burstgpt", "hf", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", default="sharegpt") parser.add_argument( @@ -521,6 +536,38 @@ def 
add_cli_args(parser: argparse.ArgumentParser): default=None, help="Split of the HF dataset.") + # prefix repetition dataset + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=None, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=None, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=None, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + parser = AsyncEngineArgs.add_cli_args(parser) From 4dff91c93da668f4cca3f80aa3a94622d21c34fc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Aug 2025 19:30:49 +0800 Subject: [PATCH 333/932] [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022) Signed-off-by: DarkLight1337 --- tests/v1/core/test_kv_cache_utils.py | 12 +----- tests/v1/core/test_prefix_caching.py | 12 +----- tests/v1/core/test_scheduler.py | 12 +----- tests/v1/core/utils.py | 12 +----- vllm/multimodal/inputs.py | 62 ++++++++-------------------- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/mm_input_cache.py | 33 ++++++++------- vllm/v1/engine/processor.py | 10 +++-- vllm/v1/request.py | 7 +++- vllm/v1/worker/gpu_model_runner.py | 4 +- 10 files changed, 59 insertions(+), 108 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e0b91e6dd7..47c74aff1e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,9 +7,7 @@ import pytest import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -42,13 +40,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 28cfca6767..89824768ed 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,9 +9,7 @@ import pytest import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams 
from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -37,13 +35,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ac70c90d92..23762a0fb6 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,9 +8,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1328,13 +1326,7 @@ def create_requests_with_priority( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) else: mm_position = None diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 52093d3d38..849c3f59ae 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,9 +6,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -143,13 +141,7 @@ def create_requests( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) mm_hashes = ["hash"] * len(mm_position) else: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 0bbac45c12..a33ce14699 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence -from dataclasses import dataclass, replace +from dataclasses import dataclass from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, @@ -218,7 +218,7 @@ class MultiModalFieldElem: i.e. the name of the keyword argument to be passed to the model. 
""" - data: Optional[NestedTensors] + data: NestedTensors """ The tensor data of this field in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], @@ -315,13 +315,8 @@ class BaseMultiModalField(ABC): if len(set(field_types)) > 1: raise ValueError(f"Cannot merge different {field_types=}") - validated_data = list[NestedTensors]() - for i, elem in enumerate(elems): - assert elem.data is not None, ( - f"Cannot merge with empty `elems[{i}]`") - validated_data.append(elem.data) - - return self._reduce_data(validated_data, pin_memory=pin_memory) + batch = [elem.data for elem in elems] + return self._reduce_data(batch, pin_memory=pin_memory) @dataclass(frozen=True) @@ -643,6 +638,17 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ + @staticmethod + def dummy(modality: str): + """Convenience class for testing.""" + mm_elem = MultiModalFieldElem( + modality=modality, + key="dummy", + data=torch.empty(1), + field=MultiModalSharedField(1), + ) + return MultiModalKwargsItem.from_elems([mm_elem]) + @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) @@ -654,46 +660,12 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): assert len(modalities) == 1, f"Found different modalities={modalities}" self._modality = next(iter(modalities)) - self._is_empty = any(elem.data is None for elem in self.values()) - @property def modality(self) -> str: return self._modality - @property - def is_empty(self) -> bool: - return self._is_empty - - def get_data(self) -> Optional[Mapping[str, NestedTensors]]: - if self._is_empty: - return None - - out_data = dict[str, NestedTensors]() - for key, elem in self.items(): - assert elem.data is not None, ( - f"Cannot get data of empty `elem[{key!r}]`") - out_data[key] = elem.data - - return out_data - - def require_data(self) -> Mapping[str, NestedTensors]: - if (data := self.get_data()) is None: - raise RuntimeError("Cannot get data of empty item") - - return data - - # These methods create a new item to avoid mutating cached items in place - def with_data(self, data: Mapping[str, NestedTensors]): - return MultiModalKwargsItem({ - key: replace(elem, data=data[key]) - for key, elem in self.items() - }) - - def without_data(self): - return MultiModalKwargsItem({ - key: replace(elem, data=None) - for key, elem in self.items() - }) + def get_data(self) -> Mapping[str, NestedTensors]: + return {key: elem.data for key, elem in self.items()} # NOTE: UserDict is for V0 compatibility. 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b29394f3e6..f7ec982db4 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Sequence from typing import Any, Optional, Union import msgspec @@ -47,7 +48,7 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - mm_kwargs: Optional[list[MultiModalKwargsItem]] + mm_kwargs: Optional[Sequence[Optional[MultiModalKwargsItem]]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: Optional[SamplingParams] diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 1fed74330f..aa7dc62fd4 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Mapping -from typing import TYPE_CHECKING +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional from vllm.multimodal import MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargsItem +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.config import ModelConfig @@ -58,21 +59,21 @@ class MultiModalInputCacheClient: def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[MultiModalKwargsItem], mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: + ) -> list[Optional[MultiModalKwargsItem]]: if not self.enabled: - return mm_kwargs + return list(mm_kwargs) assert len(mm_kwargs) == len(mm_hashes) - out_mm_items = list[MultiModalKwargsItem]() + out_mm_items = list[Optional[MultiModalKwargsItem]]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(mm_item.without_data()) + out_mm_items.append(None) else: self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item.require_data()) + MultiModalCacheItemMetadata.wraps(mm_item) out_mm_items.append(mm_item) return out_mm_items @@ -91,25 +92,27 @@ class MultiModalInputCacheServer: self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), - Mapping[str, NestedTensors], + MultiModalKwargsItem, ) def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], mm_hashes: list[str], ) -> list[MultiModalKwargsItem]: if not self.enabled: - return mm_kwargs + mm_kwargs_lst = list(mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) + return mm_kwargs_lst assert len(mm_kwargs) == len(mm_hashes) out_mm_items = list[MultiModalKwargsItem]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if (mm_data := mm_item.get_data()) is None: - out_mm_items.append(mm_item.with_data(self.mm_cache[mm_hash])) + if mm_item is None: + out_mm_items.append(self.mm_cache[mm_hash]) else: - self.mm_cache[mm_hash] = mm_data + self.mm_cache[mm_hash] = mm_item out_mm_items.append(mm_item) return out_mm_items diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 376c76a7e7..c6a23cdbf6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,6 +17,7 @@ from vllm.multimodal.utils import argsort_mm_positions 
from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.utils import is_list_of from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( @@ -295,7 +296,7 @@ class Processor: pooling_params = params.clone() # Multimodal related. - sorted_mm_inputs: Optional[list[MultiModalKwargsItem]] = None + sorted_mm_inputs: Optional[list[Optional[MultiModalKwargsItem]]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None sorted_mm_hashes: Optional[list[str]] = None if decoder_inputs["type"] == "multimodal": @@ -308,7 +309,7 @@ class Processor: # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - sorted_mm_inputs = [ + orig_sorted_mm_inputs = [ decoder_mm_inputs.get_item(modality, idx) for modality, idx in sorted_mm_idxs ] @@ -323,9 +324,12 @@ class Processor: if sorted_mm_hashes is not None: sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - sorted_mm_inputs, + orig_sorted_mm_inputs, sorted_mm_hashes, ) + else: + assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem) + sorted_mm_inputs = orig_sorted_mm_inputs return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 562925bde6..8b703b6191 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -125,14 +125,17 @@ class Request: block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] ) -> "Request": if request.mm_kwargs is not None: - assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( + mm_kwargs_lst = list(request.mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") + else: + mm_kwargs_lst = None return cls( request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - multi_modal_kwargs=request.mm_kwargs, + multi_modal_kwargs=mm_kwargs_lst, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c919b392f..5ee44a8257 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -500,8 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): second_per_grid_ts = [] audio_feature_lengths = [] use_audio_in_video = False - for item in self.requests[req_id].mm_kwargs: - mm_input = item.require_data() + for mm_item in self.requests[req_id].mm_kwargs: + mm_input = mm_item.get_data() if mm_input.get("image_grid_thw") is not None: image_grid_thw.append( mm_input["image_grid_thw"].tolist()) From 829bbd7882222c85c0ca5a17fbb2f70e543f50ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 16 Aug 2025 20:16:58 +0800 Subject: [PATCH 334/932] [New Model]mBART model (#22883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- docs/models/supported_models.md | 4 + examples/offline_inference/encoder_decoder.py | 235 +++++---- .../models/language/generation/test_mbart.py | 123 +++++ tests/models/registry.py | 2 + vllm/model_executor/models/bart.py | 444 +++++++++++++++++- vllm/model_executor/models/registry.py | 1 + 6 files changed, 717 
insertions(+), 92 deletions(-) create mode 100644 tests/models/language/generation/test_mbart.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce..a514572945 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -330,6 +330,7 @@ th { | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | +| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | @@ -418,6 +419,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +!!! note + Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0da6fa5c4a..df6c1eaf4a 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -2,9 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART +encoder/decoder models, specifically BART and mBART. + +This script is refactored to allow model selection via command-line arguments. """ +import argparse +from typing import NamedTuple, Optional + from vllm import LLM, SamplingParams from vllm.inputs import ( ExplicitEncoderDecoderPrompt, @@ -14,119 +19,175 @@ from vllm.inputs import ( ) -def create_prompts(tokenizer): - # Test prompts - # - # This section shows all of the valid ways to prompt an - # encoder/decoder model. - # - # - Helpers for building prompts - text_prompt_raw = "Hello, my name is" - text_prompt = TextPrompt(prompt="The president of the United States is") +class ModelRequestData(NamedTuple): + """ + Holds the configuration for a specific model, including its + HuggingFace ID and the prompts to use for the demo. + """ + + model_id: str + encoder_prompts: list + decoder_prompts: list + hf_overrides: Optional[dict] = None + + +def get_bart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/bart-large-cnn. + This uses the exact test cases from the original script. 
+ """ + encoder_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "An encoder prompt", + ] + decoder_prompts = [ + "A decoder prompt", + "Another decoder prompt", + ] + return ModelRequestData( + model_id="facebook/bart-large-cnn", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + ) + + +def get_mbart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/mbart-large-en-ro. + This uses prompts suitable for an English-to-Romanian translation task. + """ + encoder_prompts = [ + "The quick brown fox jumps over the lazy dog.", + "How are you today?", + ] + decoder_prompts = ["", ""] + hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} + return ModelRequestData( + model_id="facebook/mbart-large-en-ro", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + hf_overrides=hf_overrides, + ) + + +MODEL_GETTERS = { + "bart": get_bart_config, + "mbart": get_mbart_config, +} + + +def create_all_prompt_types( + encoder_prompts_raw: list, + decoder_prompts_raw: list, + tokenizer, +) -> list: + """ + Generates a list of diverse prompt types for demonstration. + This function is generic and uses the provided raw prompts + to create various vLLM input objects. + """ + text_prompt_raw = encoder_prompts_raw[0] + text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(prompt="The capital of France is") - ) - # - Pass a single prompt to encoder/decoder model - # (implicitly encoder input prompt); - # decoder input prompt is assumed to be None - - single_text_prompt_raw = text_prompt_raw # Pass a string directly - single_text_prompt = text_prompt # Pass a TextPrompt - single_tokens_prompt = tokens_prompt # Pass a TokensPrompt - - # ruff: noqa: E501 - # - Pass explicit encoder and decoder input prompts within one data structure. - # Encoder and decoder prompts can both independently be text or tokens, with - # no requirement that they be the same prompt type. Some example prompt-type - # combinations are shown below, note that these are not exhaustive. 
- - enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, - ) - enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, - ) - enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, + prompt_token_ids=tokenizer.encode( + encoder_prompts_raw[2 % len(encoder_prompts_raw)] + ) ) - # - Finally, here's a useful helper function for zipping encoder and - # decoder prompts together into a list of ExplicitEncoderDecoderPrompt - # instances + decoder_tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) + ) + single_prompt_examples = [ + text_prompt_raw, + text_prompt, + tokens_prompt, + ] + explicit_pair_examples = [ + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt_raw, + decoder_prompt=decoder_tokens_prompt, + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt, + decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=tokens_prompt, + decoder_prompt=text_prompt, + ), + ] zipped_prompt_list = zip_enc_dec_prompts( - ["An encoder prompt", "Another encoder prompt"], - ["A decoder prompt", "Another decoder prompt"], + encoder_prompts_raw, + decoder_prompts_raw, ) - - # - Let's put all of the above example prompts together into one list - # which we will pass to the encoder/decoder LLM. - return [ - single_text_prompt_raw, - single_text_prompt, - single_tokens_prompt, - enc_dec_prompt1, - enc_dec_prompt2, - enc_dec_prompt3, - ] + zipped_prompt_list + return single_prompt_examples + explicit_pair_examples + zipped_prompt_list -# Create a sampling params object. -def create_sampling_params(): +def create_sampling_params() -> SamplingParams: + """Create a sampling params object.""" return SamplingParams( temperature=0, top_p=1.0, min_tokens=0, - max_tokens=20, + max_tokens=30, ) -# Print the outputs. -def print_outputs(outputs): - print("-" * 50) +def print_outputs(outputs: list): + """Formats and prints the generation outputs.""" + print("-" * 80) for i, output in enumerate(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text print(f"Output {i + 1}:") - print( - f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}" + print(f"Encoder Prompt: {encoder_prompt!r}") + print(f"Decoder Prompt: {prompt!r}") + print(f"Generated Text: {generated_text!r}") + print("-" * 80) + + +def main(args): + """Main execution function.""" + model_key = args.model + if model_key not in MODEL_GETTERS: + raise ValueError( + f"Unknown model: {model_key}. 
" + f"Available models: {list(MODEL_GETTERS.keys())}" ) - print("-" * 50) + config_getter = MODEL_GETTERS[model_key] + model_config = config_getter() - -def main(): - dtype = "float" - - # Create a BART encoder/decoder model instance + print(f"🚀 Running demo for model: {model_config.model_id}") llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, + model=model_config.model_id, + dtype="float", + hf_overrides=model_config.hf_overrides, ) - - # Get BART tokenizer tokenizer = llm.llm_engine.get_tokenizer_group() - - prompts = create_prompts(tokenizer) + prompts = create_all_prompt_types( + encoder_prompts_raw=model_config.encoder_prompts, + decoder_prompts_raw=model_config.decoder_prompts, + tokenizer=tokenizer, + ) sampling_params = create_sampling_params() - - # Generate output tokens from the prompts. The output is a list of - # RequestOutput objects that contain the prompt, generated - # text, and other information. outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="A flexible demo for vLLM encoder-decoder models." + ) + parser.add_argument( + "--model", + "-m", + type=str, + default="bart", + choices=MODEL_GETTERS.keys(), + help="The short name of the model to run.", + ) + args = parser.parse_args() + main(args) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py new file mode 100644 index 0000000000..854a727139 --- /dev/null +++ b/tests/models/language/generation/test_mbart.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import DecoderPromptType, HfRunner, VllmRunner +from ...utils import check_logprobs_close + + +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + hf_output_str = output_str + "" + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[dict[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM mBART model by validating it against HuggingFace (HF). 
+ (Docstring content is omitted for brevity) + ''' + + vllm_prompts = prompts + if decoder_prompt_type == DecoderPromptType.NONE: + vllm_prompts = [{ + "encoder_prompt": p['encoder_prompt'], + "decoder_prompt": "" + } for p in prompts] + + vllm_kwargs = { + "hf_overrides": { + "architectures": ["MBartForConditionalGeneration"] + } + } + + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vllm_kwargs) as vllm_model: # type: ignore + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + vllm_prompts, max_tokens, num_logprobs) + + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_kwargs["decoder_start_token_id"] = ( + hf_model.tokenizer.lang_code_to_id["ro_RO"]) + + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, # HF runner still uses the original prompts + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = 0 + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize( + "model", + [pytest.param("facebook/mbart-large-en-ro")], +) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 10e29e01e8..99cf997790 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -316,6 +316,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 + hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3d328c88ff..32551d8102 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -46,7 +46,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant, SupportsV0Only -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, + maybe_prefix) logger = logging.get_logger(__name__) @@ -422,10 +423,7 @@ class BartEncoderLayer(nn.Module): if hidden_states.dtype == torch.float16 and ( torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): - clamp_value = 
torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, - min=-clamp_value, - max=clamp_value) + hidden_states = cast_overflow_tensors(hidden_states) return hidden_states @@ -906,3 +904,439 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): }) return loaded_params + + +class MBartEncoderLayer(BartEncoderLayer): + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + r""" + Args: + hidden_states + torch.Tensor of *encoder* input embeddings. + Returns: + Encoder layer output torch.Tensor + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + hidden_states = cast_overflow_tensors(hidden_states) + + return hidden_states + + +class MBartDecoderLayer(BartDecoderLayer): + + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + residual = decoder_hidden_states + hidden_states = self.self_attn_layer_norm(decoder_hidden_states) + + # Self Attention + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + # Cross-Attention Block + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states = self.encoder_attn( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class MBartEncoder(nn.Module): + """ + Transformer encoder consisting of *config.encoder_layers* + self attention layers. Each layer is a [`BartEncoderLayer`]. 
+ Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = ""): + super().__init__() + + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + embed_dim = config.d_model + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + embed_dim, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([ + MBartEncoderLayer(config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.encoder_layers) + ]) + + self.layernorm_embedding = nn.LayerNorm(embed_dim) + self.layer_norm = nn.LayerNorm(config.d_model) # 改动 + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *encoder* input sequence tokens. + Returns: + Decoder output torch.Tensor + """ + # retrieve input_ids and inputs_embeds + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states=hidden_states) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartDecoder(nn.Module): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
+ Each layer is a [`BartDecoderLayer`] + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__( + self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = "", + ): + super().__init__() + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + + self.layers = nn.ModuleList( + [MBartDecoderLayer(config, cache_config, quant_config, + prefix=f"{prefix}.layers.{layer_idx}") \ + for layer_idx in range(config.decoder_layers)]) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids: torch.Tensor, + decoder_positions: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + decoder_input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + decoder_positions + Positions of *decoder* input sequence tokens. + encoder_hidden_states: + Tensor of encoder output embeddings + Returns: + Decoder output torch.Tensor + """ + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(decoder_input_ids) + else: + decoder_positions = inputs_embeds[:, -1] + + # embed positions + embed_pos = self.embed_positions(decoder_positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + # decoder layers + + for decoder_layer in self.layers: + hidden_states = decoder_layer( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartModel(nn.Module, SupportsQuant): + _tied_weights_keys = [ + "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" + ] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.encoder = MBartEncoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") + self.decoder = MBartDecoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.decoder") + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. 
+ Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states) + + return decoder_outputs + + +class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): + base_model_prefix = "model" + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder.": "model.decoder.", + "encoder.": "model.encoder.", + "shared.": "model.shared." + }, + orig_to_new_substr={ + "beta": "bias", + "gamma": "weight", + "LayerNorm": "layernorm", + }, + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config + assert config.tie_word_embeddings + self.config = config + self.model = MBartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.lm_head = BartParallelLMHead(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + model_params_dict = dict(self.named_parameters()) + loaded_params = set() + remaining_weights = [] + shared_embedding_weight = None + + for name, loaded_weight in weights: + if any(skip in name + for skip in ["cls.", "pooler.", "final_logits_bias"]): + continue + if any(embed_name in name for embed_name in [ + 'shared.weight', 'encoder.embed_tokens.weight', + 'decoder.embed_tokens.weight' + ]): + if shared_embedding_weight is None: + shared_embedding_weight = loaded_weight + continue + is_stacked = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + vllm_name = name + for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( + ): + vllm_name = vllm_name.replace(src, dst) + for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( + ): + if 
vllm_name.startswith(src): + vllm_name = dst + vllm_name[len(src):] + break + vllm_name = vllm_name.replace(weight_name, param_name) + if vllm_name in model_params_dict: + param = model_params_dict[vllm_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(vllm_name) + is_stacked = True + break + if not is_stacked: + remaining_weights.append((name, loaded_weight)) + loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) + auto_loaded_params = loader.load_weights(remaining_weights, + mapper=self.hf_to_vllm_mapper) + loaded_params.update(auto_loaded_params) + if shared_embedding_weight is not None: + lm_head_param = self.lm_head.weight + weight_loader = getattr(lm_head_param, "weight_loader", + default_weight_loader) + weight_loader(lm_head_param, shared_embedding_weight) + self.model.encoder.embed_tokens.weight = self.lm_head.weight + self.model.decoder.embed_tokens.weight = self.lm_head.weight + loaded_params.update({ + 'model.encoder.embed_tokens.weight', 'lm_head.weight', + 'model.decoder.embed_tokens.weight' + }) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b817615b43..109bc1fe5c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -141,6 +141,7 @@ _TEXT_GENERATION_MODELS = { # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"), } _EMBEDDING_MODELS = { From 52ce1420e9f6f52308f49a2898433a52674a4a8b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sat, 16 Aug 2025 14:36:30 -0300 Subject: [PATCH 335/932] Fix handling of `max_num_batched_tokens` for pooling tasks (#23004) Signed-off-by: Max de Bayser --- vllm/config/__init__.py | 3 --- vllm/engine/arg_utils.py | 10 +++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 72fec5e205..14fc5589a8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3600,9 +3600,6 @@ class VllmConfig: logger.info(reason) self.scheduler_config.chunked_prefill_enabled = False self.scheduler_config.long_prefill_token_threshold = 0 - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.cache_config is not None: self.cache_config.enable_prefix_caching = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8af6d36e0..630fbec453 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1602,9 +1602,6 @@ class EngineArgs: self.enable_prefix_caching = incremental_prefill_supported logger.info("(%s) prefix caching by default", action) - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - # V1 should use the new scheduler by default. 
# Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: @@ -1692,8 +1689,11 @@ class EngineArgs: self.max_num_batched_tokens = \ default_max_num_batched_tokens[usage_context] else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context] + if not self.enable_chunked_prefill: + self.max_num_batched_tokens = model_config.max_model_len + else: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens[usage_context] logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) From 68373d3126b4d2c49a9983fe0696bbd48fc8aad7 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Sun, 17 Aug 2025 02:38:42 +0900 Subject: [PATCH 336/932] [Frontend] Added support for HermesToolParser for models without special tokens (#16890) Signed-off-by: minpeter --- .../tool_parsers/test_hermes_tool_parser.py | 127 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 81 ++++++++--- 2 files changed, 191 insertions(+), 17 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py new file mode 100644 index 0000000000..28b1f8358d --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from ....utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci" + +SERVER_ARGS = [ + "--enforce-eager", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + "--enable-lora", + "--lora-modules", + f"{LORA_MODEL}={LORA_MODEL}", +] + +TOOLS = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": + "The city and state, e.g. 
San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + }, + }, + "required": ["location"], + }, + }, +}] + +MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] + + +@pytest.mark.asyncio +async def test_non_streaming_tool_call(): + """Test tool call in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_current_weather" + + arguments = json.loads(tool_call.function.arguments) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Non-Streaming Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_tool_call(): + """Test tool call in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_current_weather" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Streaming Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index c7030d34d4..d126130ab9 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -52,14 +52,51 @@ class Hermes2ProToolParser(ToolParser): raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") - self.tool_call_start_token_id = self.vocab.get( - self.tool_call_start_token) - self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if (self.tool_call_start_token_id is None - or self.tool_call_end_token_id is None): - raise RuntimeError( - "Hermes 2 Pro Tool parser could not locate tool call start/end " - "tokens in the tokenizer!") + self.tool_call_start_token_ids = self.model_tokenizer.encode( + self.tool_call_start_token, add_special_tokens=False) + self.tool_call_end_token_ids = self.model_tokenizer.encode( + 
self.tool_call_end_token, add_special_tokens=False) + + self.tool_call_start_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_start_token_ids + ] + + self.tool_call_end_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_end_token_ids + ] + + self.buffered_delta_text = "" + + # Very simple idea: when encountering tokens like <, tool, _call, >, + # <, /, tool, _call, >, store them in a buffer. + # When the last token is encountered, empty the buffer and return it. + # If a token appears in an incorrect sequence while storing in the buffer, + # return the preceding buffer along with the token. + def tool_call_delta_buffer(self, delta_text: str): + # If the sequence of tool_call_start or tool_call_end tokens is not yet + # complete, fill the buffer with the token and return "". + if (delta_text in self.tool_call_start_token_array + or delta_text in self.tool_call_end_token_array): + # If delta_text is the last token of tool_call_start_token or + # tool_call_end_token, empty the buffer and return + # the buffered text + delta_text. + if (delta_text == self.tool_call_start_token_array[-1] + or delta_text == self.tool_call_end_token_array[-1]): + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + self.buffered_delta_text = self.buffered_delta_text + delta_text + return "" + else: + if self.buffered_delta_text: + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + return delta_text def extract_tool_calls( self, @@ -124,11 +161,23 @@ class Hermes2ProToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: + # 1. All tokens are parsed based on _text, not token_ids. + # 2. All incoming text data is processed by the tool_call_delta_buffer + # function for buffering before being used for parsing. + + delta_text = self.tool_call_delta_buffer(delta_text) + # If the last characters of previous_text + # match self.buffered_delta_text, remove only the matching part. 
+ if (len(previous_text) >= len(self.buffered_delta_text) + and previous_text[-len(self.buffered_delta_text):] + == self.buffered_delta_text): + previous_text = previous_text[:-len(self.buffered_delta_text)] + current_text = previous_text + delta_text logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) # check to see if we should be streaming a tool call - is there a - if self.tool_call_start_token_id not in current_token_ids: + if self.tool_call_start_token not in current_text: logger.debug("No tool call tokens found!") return DeltaMessage(content=delta_text) @@ -136,14 +185,12 @@ class Hermes2ProToolParser(ToolParser): # figure out where we are in the parsing by counting tool call # start & end tags - prev_tool_start_count = previous_token_ids.count( - self.tool_call_start_token_id) - prev_tool_end_count = previous_token_ids.count( - self.tool_call_end_token_id) - cur_tool_start_count = current_token_ids.count( - self.tool_call_start_token_id) - cur_tool_end_count = current_token_ids.count( - self.tool_call_end_token_id) + prev_tool_start_count = previous_text.count( + self.tool_call_start_token) + prev_tool_end_count = previous_text.count(self.tool_call_end_token) + cur_tool_start_count = current_text.count( + self.tool_call_start_token) + cur_tool_end_count = current_text.count(self.tool_call_end_token) tool_call_portion = None text_portion = None From 000cceca8c329d5b5d99e0186fbd444a390384cd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:16:00 -0400 Subject: [PATCH 337/932] [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016) Signed-off-by: mgoin --- vllm/attention/layer.py | 9 +++++++++ vllm/v1/attention/backends/flashinfer.py | 3 --- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1a9c0e26b5..0e87fa3f23 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -308,6 +308,15 @@ class Attention(nn.Module): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + # FlashInfer requires attention sinks to be float32 + if (self.backend == _Backend.FLASHINFER_VLLM_V1 + and hasattr(self.impl, 'sinks')): + from vllm.v1.attention.backends.flashinfer import FlashInferImpl + assert isinstance(self.impl, FlashInferImpl) + if (self.impl.sinks is not None + and self.impl.sinks.dtype != torch.float32): + self.impl.sinks = self.impl.sinks.to(torch.float32) + def get_attn_backend(self) -> type[AttentionBackend]: return self.attn_backend diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index eac3f33e15..991904229f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -642,9 +642,6 @@ class FlashInferImpl(AttentionImpl): f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." 
) - # Cast sinks to float32 if needed (FlashInfer requirement) - if sinks.dtype != torch.float32: - sinks = sinks.to(torch.float32) self.sinks = sinks def forward( From 3253ae765ef4dc0604a6f3ed3a1dcd61fdda6bda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:33:08 -0400 Subject: [PATCH 338/932] [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028) Signed-off-by: mgoin --- tests/entrypoints/openai/test_default_mm_loras.py | 3 ++- tests/mq_llm_engine/test_error_handling.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 372e9b1fec..b9c466a6fb 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -48,7 +48,8 @@ def multimodal_server(): # noqa: F811 f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}", ] - with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args, + max_wait_seconds=480) as remote_server: yield remote_server diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 3feee01dad..77e3732cd0 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -255,8 +255,8 @@ async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): pass end = time.perf_counter() - assert end - start < 60, ( - "Expected vLLM to gracefully shutdown in <60s " + assert end - start < 100, ( + "Expected vLLM to gracefully shutdown in <100s " "if there is an error in the startup.") From 4fc722eca4f6ad63edf1936989f4d2171aab3ca2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 15:38:21 -0400 Subject: [PATCH 339/932] [Kernel/Quant] Remove AQLM (#22943) Signed-off-by: mgoin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../scripts/hardware_ci/run-amd-test.sh | 1 - CMakeLists.txt | 1 - benchmarks/kernels/benchmark_aqlm.py | 345 ---------- csrc/ops.h | 9 - csrc/quantization/aqlm/gemm_kernels.cu | 597 ------------------ csrc/torch_bindings.cpp | 15 - .../quantization/supported_hardware.md | 1 - docs/mkdocs/hooks/generate_examples.py | 1 - examples/offline_inference/basic/README.md | 14 - tests/compile/test_full_graph.py | 4 - tests/kernels/quantization/test_aqlm.py | 40 -- tests/models/quantization/test_aqlm.py | 68 -- vllm/_custom_ops.py | 41 -- vllm/model_executor/layers/linear.py | 18 - .../layers/quantization/__init__.py | 3 - .../layers/quantization/aqlm.py | 376 ----------- 16 files changed, 1534 deletions(-) delete mode 100644 benchmarks/kernels/benchmark_aqlm.py delete mode 100644 csrc/quantization/aqlm/gemm_kernels.cu delete mode 100644 tests/kernels/quantization/test_aqlm.py delete mode 100644 tests/models/quantization/test_aqlm.py delete mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 5e5a532cb5..df0bae0c9c 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -121,7 +121,6 @@ fi if [[ $commands == *" kernels/quantization"* ]]; then commands="${commands} \ --ignore=kernels/quantization/test_int8_quant.py \ - --ignore=kernels/quantization/test_aqlm.py \ --ignore=kernels/quantization/test_machete_mm.py \ 
--ignore=kernels/quantization/test_block_fp8.py \ --ignore=kernels/quantization/test_block_int8.py \ diff --git a/CMakeLists.txt b/CMakeLists.txt index cda1ffc795..34386d670a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b08..0000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, bias) - - -# Compare the optimized 1x16 and 2x8 cuda 
decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. -def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. - weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/csrc/ops.h b/csrc/ops.h index 3e29f0a973..6e39758f16 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -154,15 +154,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias); - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu deleted file mode 100644 index 79cd2c610b..0000000000 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace vllm { -namespace aqlm { - -__global__ void Code1x16MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - __shared__ int4 sh_b[32 * 9]; - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - half2* a = reinterpret_cast(&dec); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code2x8MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. - -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - extern __shared__ int4 sh[]; - int4* sh_b = sh; - int4* sh_code = sh_b + 32 * 9; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code1x16Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. 
- asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -__global__ void Code2x8Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - extern __shared__ int4 sh[]; - int4* sh_code = sh; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); -#pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -inline int ceildiv(int a, int b) { return (a + b - 1) / b; } - -const int THREAD_M = 16; - -void code1x16_matvec_cuda(const void* __restrict__ A, - const void* __restrict__ B, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute(Code2x8MatVec, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); 
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code1x16_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long. - codebook_stride // as int4. - ); -} - -// Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - cudaFuncSetAttribute(Code2x8Dequant, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - Code2x8Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, codebook_stride); -} - -int codebook_stride(const torch::Tensor& codebooks) { - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each - // codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - codebook_stride(codebook)); -} - -torch::Tensor code1x16_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, - torch::Tensor& C, const torch::Tensor& codebook, - const int4 codebook_a_sizes) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - 2 * codebook_stride(codebook)); -} - -torch::Tensor code2x8_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - size_t i = 0; - int last = 0; - assert(codebook_partition_sizes.size() <= 4); - for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i] + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) { - *cumulative_size = last * 10; - } - return cumulative_sizes; -} - -} // namespace aqlm -} // namespace vllm - -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features == std::accumulate(codebook_partition_sizes.begin(), - codebook_partition_sizes.end(), 0)); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device())); - - if (nbooks == 1 && entries == (1 << 16)) { - vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation.) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) { - vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index a547baec50..5fee106335 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -207,21 +207,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization ops #ifndef USE_ROCM - // Quantized GEMM for AQLM. - ops.def( - "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, " - "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) " - "-> Tensor", - {stride_tag}); - ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); - - // Decompression method for AQLM. 
- ops.def( - "aqlm_dequant(Tensor codes, Tensor codebooks, " - "int[] codebook_partition_sizes) -> Tensor", - {stride_tag}); - ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); - // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index f53e69ecc6..06264d08b5 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -17,7 +17,6 @@ th { | INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6b4c5b3107..1e8b848db4 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -24,7 +24,6 @@ def fix_case(text: str) -> str: "llm": "LLM", "mae": "MAE", "tpu": "TPU", - "aqlm": "AQLM", "gguf": "GGUF", "lora": "LoRA", "rlhf": "RLHF", diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md index 0a2bd6e2b7..cbb3116e97 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/offline_inference/basic/README.md @@ -52,20 +52,6 @@ Try it yourself with the following argument: ### Quantization -#### AQLM - -vLLM supports models that are quantized using AQLM. - -Try one yourself by passing one of the following models to the `--model` argument: - -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf` -- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf` -- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf` - -> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs. - #### GGUF vLLM supports models that are quantized using GGUF. diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 72f962ed74..a2fc6ffeb8 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -31,10 +31,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ] if all: - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) # TODO: figure out why this fails. 
if False and is_quant_method_supported("gguf"): # noqa: SIM223 diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py deleted file mode 100644 index 427db3e602..0000000000 --- a/tests/kernels/quantization/test_aqlm.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops # noqa: F401 - - -def test_aqlm_dequant_opcheck(): - codes = torch.randint(-32768, - 32767, (22016, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((2, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - codebook_partition_sizes = [11008, 11008] - - opcheck(torch.ops._C.aqlm_dequant, - (codes, codebooks, codebook_partition_sizes)) - - -def test_aqlm_gemm_opcheck(): - input = torch.rand((4, 4096), device='cuda', dtype=torch.float16) - codes = torch.randint(-32768, - 32767, (12288, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((3, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16) - codebook_partition_sizes = [4096, 4096, 4096] - bias = None - - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, None)) - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, bias)) diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py deleted file mode 100644 index de6851e2fc..0000000000 --- a/tests/models/quantization/test_aqlm.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - -from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform - -# These ground truth generations were generated using `transformers==4.38.1 -# aqlm==1.1.0 torch==2.2.0` -# and the below code: -# ```python -# from transformers import AutoTokenizer, AutoModelForCausalLM -# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, -# torch_dtype="auto", device_map="cuda").cuda() -# tokenizer = AutoTokenizer.from_pretrained(model_id) -# outputs = [] -# for prompt in example_prompts: -# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") -# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) -# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) -# print(outputs) -# ``` -ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: v', - 'The major milestones in the development of artificial intelligence from ' - '195', - 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The', - 'Explain the difference between supervised and unsupervised learning.' - '\nExplain', - 'Write a short story about a robot that dreams for the first time. 
The', - 'Analyze the impact of the COVID-19 pandemic on global economic', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it', - 'The early bird catches the worm.\nThe early bird catches the' -] - - -@pytest.mark.skipif(not is_quant_method_supported("aqlm") - or current_platform.is_rocm() - or not current_platform.is_cuda(), - reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("num_logprobs", [1]) -def test_models( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # loop through the prompts to compare against the ground truth generations - for prompt_idx in range(len(example_prompts)): - vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ - prompt_idx] - - print("Prompt: ", repr(example_prompts[prompt_idx])) - print("Reference output:", repr(ground_truth_generations[prompt_idx])) - print("Output output: ", repr(vllm_output_str)) - assert vllm_output_str == ground_truth_generations[prompt_idx] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a318637c5a..0d556053f8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -476,32 +476,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): dtype=input.dtype, device=input.device).sum(0) - @register_fake("_C::aqlm_gemm") - def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - out_features = codes.size(0) * codebooks.size(2) - flat_input = input.reshape((-1, input.size(-1))) - flat_output = torch.empty((flat_input.size(0), out_features), - dtype=input.dtype, - device=input.device) - - output_sizes = list(input.shape) - output_sizes.pop() - output_sizes.append(-1) - return flat_output.reshape(tuple(output_sizes)) - - @register_fake("_C::aqlm_dequant") - def _aqlm_dequant_fake( - codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - in_features = codes.size(1) * 8 - out_features = codes.size(0) - return torch.empty((out_features, in_features), - dtype=codebooks.dtype, - device=codebooks.device) - @register_fake("_C::machete_mm") def machete_mm_fake( a: torch.Tensor, @@ -957,21 +931,6 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, sf_offsets) -# aqlm -def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, - codebook_partition_sizes, bias) - - -def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - return torch.ops._C.aqlm_dequant(codes, codebooks, - codebook_partition_sizes) - - # gptq_marlin def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 75391c51f7..671ad9eed2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -692,8 +692,6 @@ 
class MergedColumnParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scale to load scalar into fused array. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -781,13 +779,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(0, shard_offset, shard_size) - # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( @@ -1081,8 +1072,6 @@ class QKVParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scales in fused case. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -1204,13 +1193,6 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(0, shard_index * shard_size, - shard_size) # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 8d63027e18..a4c2671225 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -7,7 +7,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) QuantizationMethods = Literal[ - "aqlm", "awq", "deepspeedfp", "tpu_int8", @@ -88,7 +87,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig - from .aqlm import AQLMConfig from .auto_round import AutoRoundConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -120,7 +118,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .tpu_int8 import Int8TpuConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "aqlm": AQLMConfig, "awq": AWQConfig, "deepspeedfp": DeepSpeedFPConfig, "tpu_int8": Int8TpuConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py deleted file mode 100644 index 2ea8c5dc51..0000000000 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM -# and https://arxiv.org/pdf/2401.06118.pdf - -import math -from typing import Any, Optional - -import torch -import torch.nn.functional as F -from 
torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.utils import set_weight_attrs - - -def get_int_dtype(nbits: int) -> torch.dtype: - if nbits <= 8: - return torch.int8 - if nbits <= 16: - return torch.int16 - if nbits <= 32: - return torch.int32 - if nbits <= 64: - return torch.int64 - raise ValueError(f"No dtype available for {nbits}-bit codebooks") - - -@torch.inference_mode() -def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: - return data.to(torch.int64) % (2**nbits) - - -def dequantize_weight(codes: torch.Tensor, - codebooks: torch.Tensor, - scales: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - Decode float weights from quantization codes. Differentiable. - :param codes: tensor of integer quantization codes, shape - [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, - [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be - broadcastble with - [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape - [*dims, num_in_groups*group_size] - """ - num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = \ - codebooks.shape - out_features = num_out_groups * out_group_size - in_features = num_in_groups * in_group_size - codebook_offsets = torch.arange( - 0, num_codebooks * codebook_size, codebook_size, - device=codes.device) # shape: [num_codebooks] - reconstructed_weight_flat = F.embedding_bag( - codes.flatten(0, -2) + codebook_offsets, - codebooks.flatten(0, 1).flatten(-2, -1), - mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size - # * in_group_size] - - reconstructed_weight_groupwise = reconstructed_weight_flat.view( - list(codes.shape[:-3]) + - [num_out_groups, num_in_groups, out_group_size, in_group_size]) - if scales is not None: - reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( - scales) - return reconstructed_weight_groupwise.swapaxes( - -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) - - -def dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - bias: Optional[torch.Tensor], -) -> torch.Tensor: - dequantized_weight = dequantize_weight( - unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), - codebooks, - scales, - ) - return F.linear(input, dequantized_weight, bias) - - -# Generic dequantization, slow but flexible. -def generic_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. 
- Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - output_shape = input.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=input.dtype, device=input.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. - # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big - # multiply at the end. - num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - input, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), None - if bias is None else bias.narrow(0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - return output - - -# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 -# at 6 and 9 times faster than the generic version above, respectively. -def optimized_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - # scaling the output is fastest, so we do that when possible. - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -class AQLMConfig(QuantizationConfig): - """Config class for AQLM. - - Reference: https://github.com/Vahe1994/AQLM - """ - - def __init__( - self, - in_group_size: int, - nbits_per_codebook: int, - num_codebooks: int, - out_group_size: int, - ) -> None: - super().__init__() - self.in_group_size = in_group_size - self.nbits_per_codebook = nbits_per_codebook - self.num_codebooks = num_codebooks - self.out_group_size = out_group_size - - # out_group_size > 1 is untested, and probably won't work as-is. 
- assert (self.out_group_size == 1) - self.pack_factor = (self.in_group_size * self.out_group_size) - - def __repr__(self) -> str: - return (f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "aqlm" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return [] # no extra configs. - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": - in_group_size = cls.get_from_keys(config, ["in_group_size"]) - nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) - num_code_books = cls.get_from_keys(config, ["num_codebooks"]) - out_group_size = cls.get_from_keys(config, ["out_group_size"]) - return cls(in_group_size, nbits_per_codebook, num_code_books, - out_group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AQLMLinearMethod"]: - if isinstance(layer, LinearBase): - return AQLMLinearMethod(self) - return None - - -class AQLMLinearMethod(LinearMethodBase): - """Linear method for AQLM. - - Args: - quant_config: The AQLM quantization config. - """ - - def __init__(self, quant_config: AQLMConfig): - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - del output_size # Unused. - del input_size # Unused. - - if params_dtype != torch.half: - raise ValueError("Only half is currently supported by aqlm") - if input_size_per_partition % self.quant_config.in_group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.out_group_size != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - codes = Parameter( - torch.empty( - # There could actually be two pack factors, one along input and - # one along output, but we don't currently support - # out_group_size, and only the one along output needs to be - # marked with "packed_dim" in order for QKVLinear to work. 
- output_size_per_partition, - input_size_per_partition // self.quant_config.pack_factor, - self.quant_config.num_codebooks, - dtype=get_int_dtype(self.quant_config.nbits_per_codebook), - ), - requires_grad=False, - ) - - set_weight_attrs( - codes, - { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) - - codebooks = Parameter( - torch.empty( - self.quant_config.num_codebooks * len(output_partition_sizes), - 2**self.quant_config.nbits_per_codebook, - self.quant_config.out_group_size, - self.quant_config.in_group_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - codebooks, - { - # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": output_partition_sizes - }, - ) - - scales = Parameter( - torch.empty( - ( - output_size_per_partition // - self.quant_config.out_group_size, - 1, - 1, - 1, - ), - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 0, - "packed_dim": 0, - "pack_factor": self.quant_config.out_group_size - }, - ) - - layer.register_parameter("codes", codes) - set_weight_attrs(codes, extra_weight_attrs) - layer.register_parameter("codebooks", codebooks) - set_weight_attrs(codebooks, extra_weight_attrs) - layer.register_parameter("scales", scales) - set_weight_attrs(scales, extra_weight_attrs) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - codebooks = layer.codebooks - codes = layer.codes - scales = layer.scales - output_partition_sizes = getattr(codebooks, "output_partition_sizes", - []) - - nbooks = codes.shape[2] - ingroups = codebooks.shape[3] - outgroups = codebooks.shape[2] - bits = codebooks.shape[1] - - # We support these formats with dedicated gemm and decompression - # kernels. 
- if ingroups == 8 and outgroups == 1 and ( - (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - - # thresholds determined by timings on an A6000, one GPU - use_gemv = math.prod(x.shape[:-1]) <= 6 - - return ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else optimized_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) - - # fall back all unoptimized formats - return generic_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) From bf7f470b22e8bf26e1edb30b3bf465ab7dd69f0c Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Sat, 16 Aug 2025 15:59:17 -0400 Subject: [PATCH 340/932] [V1] Logits processors extensibility (#19912) Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill Co-authored-by: Andrew Feldman Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + .../offline_inference/logits_processor.py | 147 +++++++++ tests/utils.py | 79 ++++- tests/v1/logits_processors/__init__.py | 0 .../test_correctness.py} | 24 +- .../logits_processors/test_custom_offline.py | 237 ++++++++++++++ .../logits_processors/test_custom_online.py | 180 +++++++++++ tests/v1/logits_processors/utils.py | 127 ++++++++ tests/v1/sample/test_rejection_sampler.py | 4 +- tests/v1/sample/test_sampler.py | 4 +- tests/v1/worker/test_gpu_input_batch.py | 4 +- vllm/config/__init__.py | 5 + vllm/engine/arg_utils.py | 8 + vllm/entrypoints/llm.py | 4 + vllm/utils/__init__.py | 2 +- vllm/v1/sample/logits_processor/__init__.py | 185 +++++++++++ .../builtin.py} | 294 ++---------------- vllm/v1/sample/logits_processor/interface.py | 86 +++++ vllm/v1/sample/logits_processor/state.py | 149 +++++++++ vllm/v1/sample/metadata.py | 4 +- vllm/v1/worker/gpu_input_batch.py | 91 ++++-- vllm/v1/worker/gpu_model_runner.py | 11 +- 22 files changed, 1312 insertions(+), 334 deletions(-) create mode 100644 examples/offline_inference/logits_processor.py create mode 100644 tests/v1/logits_processors/__init__.py rename tests/v1/{sample/test_logits_processors.py => logits_processors/test_correctness.py} (97%) create mode 100644 tests/v1/logits_processors/test_custom_offline.py create mode 100644 tests/v1/logits_processors/test_custom_online.py create mode 100644 tests/v1/logits_processors/utils.py create mode 100644 vllm/v1/sample/logits_processor/__init__.py rename vllm/v1/sample/{logits_processor.py => logits_processor/builtin.py} (54%) create mode 100644 vllm/v1/sample/logits_processor/interface.py create mode 100644 vllm/v1/sample/logits_processor/state.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 87296a08e2..4fc8857854 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -253,6 +253,7 @@ steps: - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py new file mode 100644 index 0000000000..7ef20efa7d --- /dev/null +++ b/examples/offline_inference/logits_processor.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates instantiating vLLM with a custom logits processor +class object. + +For a basic example of implementing a custom logits processor, see +the `DummyLogitsProcessor` implementation in `vllm/test_utils.py`. + +For testing purposes, a dummy logits processor is employed which, if +`target_token` is passed as a keyword argument to `SamplingParams.extra_args`, +will mask out all tokens except `target_token`. + +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect the `target_token` to be decoded in each step, yielding an output +similar to that shown below: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ +""" + +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.config import VllmConfig +from vllm.v1.sample.logits_processor import ( + BatchUpdate, + LogitsProcessor, + MoveDirectionality, +) + + +# Hypothetical custom logits processor +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__( + self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool + ): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and ( + target_token := params.extra_args.get("target_token") + ): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor( + [self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device, + ) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float("-inf") + logits[rows, cols] = values_to_keep + + return logits + + +# Sample prompts. 
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/utils.py b/tests/utils.py index 18fcde9491..e98707fb44 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,7 @@ import tempfile import time import warnings from contextlib import contextmanager, suppress +from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -76,6 +77,23 @@ VLLM_PATH = Path(__file__).parent.parent class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + """Subclasses override this method to customize server process launch + """ + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if env_dict is not None: + env.update(env_dict) + self.proc: subprocess.Popen = subprocess.Popen( + ["vllm", "serve", model, *vllm_serve_args], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + def __init__(self, model: str, vllm_serve_args: list[str], @@ -128,18 +146,7 @@ class RemoteOpenAIServer: model_loader = get_model_loader(load_config) model_loader.download_model(model_config) - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - if env_dict is not None: - env.update(env_dict) - self.proc = subprocess.Popen( - ["vllm", "serve", model, *vllm_serve_args], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) + self._start_server(model, vllm_serve_args, env_dict) max_wait_seconds = max_wait_seconds or 240 self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) @@ -155,6 +162,10 @@ class RemoteOpenAIServer: # force kill if needed self.proc.kill() + def _poll(self) -> Optional[int]: + """Subclasses override this method to customize process polling""" + return self.proc.poll() + def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() @@ -169,7 +180,7 @@ class RemoteOpenAIServer: # which means the server is not ready yet. # the stack trace is not useful, so we suppress it # by using `raise from None`. 
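The example above passes the processor class directly to the `LLM` constructor. The loader introduced later in this patch also discovers processors from the `vllm.logits_processors` entry-point group, which is how an installable plugin would ship one. A minimal packaging sketch, assuming setuptools; the distribution, module, and class names (`my-vllm-plugin`, `my_vllm_plugin.procs`, `TargetTokenLogitsProcessor`) are hypothetical placeholders:

# setup.py for a hypothetical third-party logits-processor plugin.
# After `pip install .`, vLLM can discover the processor through
# importlib.metadata entry points in the "vllm.logits_processors" group,
# without any --logits-processors flag or constructor argument.
from setuptools import find_packages, setup

setup(
    name="my-vllm-plugin",  # hypothetical distribution name
    version="0.1.0",
    packages=find_packages(),
    entry_points={
        "vllm.logits_processors": [
            # value uses the same "module.path:ClassName" FQCN syntax as
            # the --logits-processors CLI flag
            "target_token = my_vllm_plugin.procs:TargetTokenLogitsProcessor",
        ],
    },
)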
- result = self.proc.poll() + result = self._poll() if result is not None and result != 0: raise RuntimeError("Server exited unexpectedly.") from None @@ -205,6 +216,48 @@ class RemoteOpenAIServer: **kwargs) +class RemoteOpenAIServerCustom(RemoteOpenAIServer): + """Launch test server with custom child process""" + + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + self.proc: Process = Process( + target=self.child_process_fxn, + args=(env_dict, model, + vllm_serve_args)) # type: ignore[assignment] + self.proc.start() + + def __init__(self, + model: str, + vllm_serve_args: list[str], + child_process_fxn: Callable[ + [Optional[dict[str, str]], str, list[str]], None], + *, + env_dict: Optional[dict[str, str]] = None, + seed: Optional[int] = 0, + auto_port: bool = True, + max_wait_seconds: Optional[float] = None) -> None: + """Store custom child process function then invoke superclass + constructor which will indirectly launch it.""" + self.child_process_fxn = child_process_fxn + super().__init__(model=model, + vllm_serve_args=vllm_serve_args, + env_dict=env_dict, + seed=seed, + auto_port=auto_port, + max_wait_seconds=max_wait_seconds) + + def _poll(self) -> Optional[int]: + return self.proc.exitcode + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + self.proc.join(8) + if self.proc.is_alive(): + # force kill if needed + self.proc.kill() + + def _test_completion( client: openai.OpenAI, model: str, diff --git a/tests/v1/logits_processors/__init__.py b/tests/v1/logits_processors/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/v1/sample/test_logits_processors.py b/tests/v1/logits_processors/test_correctness.py similarity index 97% rename from tests/v1/sample/test_logits_processors.py rename to tests/v1/logits_processors/test_correctness.py index 84ee3b0392..43caef79b0 100644 --- a/tests/v1/sample/test_logits_processors.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -9,11 +9,13 @@ import numpy as np import pytest import torch +from tests.utils import create_new_process_for_each_test from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits, create_penalty_tensor, create_prompt_tokens_tensor, fake_apply_logitsprocs, fake_update_logitsprocs_state) +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available @@ -24,7 +26,7 @@ from vllm.v1.sample.logits_processor import (BatchUpdate, BatchUpdateBuilder, MinPLogitsProcessor, MinTokensLogitsProcessor, MoveDirectionality, - init_builtin_logitsprocs) + build_logitsprocs) # yapf: enable from vllm.v1.sample.metadata import SamplingMetadata @@ -53,6 +55,7 @@ class LogitsProcsRequestParams: workload_index: int logitproc_type: LogitprocType # Logitproc enabled, specified by str id out_tokens: list[int] # Output tokens required for min tokens test + prompt_tokens: list[int] # Dummy prompt tokens placeholder params: SamplingParams # Settings customized for logitproc def __init__(self, workload_index: int, logitproc_type: LogitprocType): @@ -63,6 +66,7 @@ class LogitsProcsRequestParams: # don't matter *for these tests* so use 0 as a dummy value self.out_tokens = ([0] * (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2))) + self.prompt_tokens = [] self.params = _sampling_params_from_logitproc(logitproc_type) def __str__(self): @@ -88,11 +92,12 @@ def _generate_fake_sampling_metadata( vocab_size, 
size=np.random.randint( 1, MAX_NUM_PROMPT_TOKENS)).tolist()) - logitsprocs = init_builtin_logitsprocs( - pin_memory_available=PIN_MEMORY_AVAILABLE, - max_num_reqs=MAX_NUM_REQS + 1, - device=device) - + logitsprocs = build_logitsprocs( + vllm_config=VllmConfig(), + device=device, + is_pin_memory=PIN_MEMORY_AVAILABLE, + is_pooling_model=False, + ) fake_sampling_metadata = SamplingMetadata( temperature=torch.full((batch_size, ), 0.0), all_greedy=True, @@ -462,7 +467,8 @@ def _generate_fake_step_update( # Replace as many removed requests as possible with added requests add_remove_idx = batch_update_builder.pop_removed() batch_update_builder.added.append( - (add_remove_idx, add_req_params.params, add_req_params.out_tokens)) + (add_remove_idx, add_req_params.params, + add_req_params.prompt_tokens, add_req_params.out_tokens)) persistent_batch[add_remove_idx] = add_req_params # Append remaining added requests to end of batch @@ -470,7 +476,8 @@ def _generate_fake_step_update( num_step_add_replace):(wdx + num_step_add)] batch_update_builder.added.extend([ - (adx + batch_size, add_req_params.params, add_req_params.out_tokens) + (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens, + add_req_params.out_tokens) for adx, add_req_params in enumerate(add_reqs_append) ]) persistent_batch.extend(add_reqs_append) @@ -561,6 +568,7 @@ def _assert_valid( step_idx=step_idx) +@create_new_process_for_each_test() @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py new file mode 100644 index 0000000000..a7fde1990f --- /dev/null +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import sys +from typing import Union + +import pytest + +from tests.utils import create_new_process_for_each_test +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + POOLING_MODEL_NAME, TEMP_GREEDY, + CustomLogitprocSource, + DummyLogitsProcessor, + dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts +# yapf: enable +from vllm import LLM, SamplingParams +from vllm.v1.sample.logits_processor import (STR_POOLING_REJECTS_LOGITSPROCS, + LogitsProcessor) + +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 128}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 67}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), +] + + +def _run_test(kwargs: dict, logitproc_loaded: bool) -> None: + """Compare `LLM` instance initialized with specified `kwargs` against + reference `LLM` instance. + + Two scenarios: + 1. 
Server has loaded dummy logitproc; test that requests which specify
+       dummy logitproc arg value behave as if logitproc is operating (output
+       token value should repeat), while requests that don't specify dummy
+       logitproc arg value should match reference `LLM` output.
+    2. Server has *not* loaded dummy logitproc; test that all requests
+       behave as if logitproc is *not* operating (output matches reference
+       `LLM` output.)
+
+    Args:
+      kwargs: `LLM` constructor kwargs
+      logitproc_loaded: server has loaded dummy logitproc if True
+    """
+
+    # Create a vLLM instance and load custom logitproc
+    llm_logitproc = LLM(
+        model=MODEL_NAME,
+        gpu_memory_utilization=0.1,
+        **kwargs,
+    )
+
+    # Create a reference vLLM instance without custom logitproc
+    llm_ref = LLM(model=MODEL_NAME, gpu_memory_utilization=0.1)
+
+    # Run inference with logitproc loaded
+    outputs_logitproc = llm_logitproc.generate(prompts, sampling_params_list)
+
+    # Reference run
+    outputs_ref = llm_ref.generate(prompts, sampling_params_list)
+
+    # Validate outputs
+    for bdx, (out_lp, out_ref, params) in enumerate(
+            zip(outputs_logitproc, outputs_ref, sampling_params_list)):
+        lp_toks = out_lp.outputs[0].token_ids
+        if logitproc_loaded and params.extra_args:
+            # This request exercises custom logitproc; validate that logitproc
+            # forces `target_token` to be decoded in each step
+            target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
+            if not all(x == target_token for x in lp_toks):
+                raise AssertionError(
+                    f"Request {bdx} generated {lp_toks}, should all be "
+                    f"{target_token}")
+        else:
+            # This request does not exercise custom logitproc (or custom
+            # logitproc is not enabled on this server); validate against
+            # reference result
+            ref_toks = out_ref.outputs[0].token_ids
+            if lp_toks != ref_toks:
+                raise AssertionError(
+                    f"Request {bdx} generated {lp_toks}, should match "
+                    f"{ref_toks}")
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource))
+def test_custom_logitsprocs(monkeypatch,
+                            logitproc_source: CustomLogitprocSource):
+    """Test offline Python interface for passing custom logitsprocs
+
+    Construct an `LLM` instance which loads a custom logitproc that has a
+    well-defined behavior (mask out all tokens except one `target_token`)
+
+    Construct a reference `LLM` instance with no custom logitproc
+
+    Pass in a batch of requests, 50% of which pass a `target_token` value
+    in through `SamplingParams.extra_args`, 50% of which do not.
+ + Validate that + * Requests which do not activate the custom logitproc, yield the same + results for both `LLM` instances + * Requests which activate the custom logitproc, only output `target_token` + + Test four scenarios, corresponding to `logitproc_source` value + * No logitsprocs loaded - test that generated tokens match reference `LLM` + instance output + * Logitproc passed in via {entrypoint, class object, fully-qualified class + name (FQCN)} - test that dummy logitproc is utilized correctly when + provided via any of these three possible sources + + Args: + monkeypatch: for setting env vars + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), class object, or None) the user pulls the + logitproc from + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + random.seed(40) + + # Choose LLM args based on logitproc source + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_NONE: + # Scenario: the server does not load any custom logitproc + # Every other scenario is a different way of loading a custom logitproc + _run_test({}, logitproc_loaded=False) + return + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a logitproc from a preconfigured entrypoint + # To that end, mock a dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for workers to see entrypoint patch + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + _run_test({}, logitproc_loaded=True) + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + # Inject dummy module which defines logitproc + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + _run_test(kwargs, logitproc_loaded=True) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", [ + CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT, + CustomLogitprocSource.LOGITPROC_SOURCE_FQCN, + CustomLogitprocSource.LOGITPROC_SOURCE_CLASS, +]) +def test_pooling_rejects_custom_logitsprocs( + monkeypatch, logitproc_source: CustomLogitprocSource): + """Validate that vLLM engine initialization properly rejects custom + logitsprocs when the model is a pooling model. + + Use `LLM` entrypoint. We expect `LLM` initialization to fail before the + logitproc is actually loaded. 
+ + Scenario 1: + * Mock a logitproc entrypoint + * Validate that `LLM` does not load the logitproc + + Scenario 2: + * Pass custom logitproc to `LLM` constructor + * Scenario 2a: via FQCN + * Scenario 2b: via class object + * Validate that initialization fails with appropriate exception + + Args: + monkeypatch: used to set environment variables + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), or class object) the user pulls the + logitproc from + """ + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + random.seed(40) + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a pooling model and ignores a logitproc that is + # available at a preconfigured entrypoint + + # Patch in dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for entrypoint patch to be visible to workers, + # although they should ignore the entrypoint patch anyway + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + + llm = LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + ) + # Require that no logitsprocs have been loaded + assert sum([ + 1 for _ in llm.llm_engine.model_executor.driver_worker.worker. + model_runner.input_batch.logitsprocs.all + ]) == 0 + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS): + # Require that loading a pooling model alongside the logitproc raises + # the appropriate exception. 
+ LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py new file mode 100644 index 0000000000..a01a479e5b --- /dev/null +++ b/tests/v1/logits_processors/test_custom_online.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random +import sys +from typing import Any, Optional + +import openai +import pytest +import pytest_asyncio + +from tests.utils import (RemoteOpenAIServerCustom, + create_new_process_for_each_test) +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + TEMP_GREEDY, dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts + +# yapf: enable + + +def _server_with_logitproc_entrypoint( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject dummy logitproc entrypoint""" + + # Patch `entry_points` to inject logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + from vllm.entrypoints.cli import main + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +def _server_with_logitproc_module( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject module with dummy logitproc""" + + # Patch `modules` to inject dummy logitproc module + from vllm.entrypoints.cli import main + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + ] + + +@pytest.fixture(scope="function", + params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]]) +def server(default_server_args, request, monkeypatch): + """Consider two server configurations: + (1) --logits-processors cli arg specifies dummy logits processor via fully- + qualified class name (FQCN); patch in a dummy logits processor module + (2) No --logits-processors cli arg; patch in a dummy logits processor + entrypoint + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + + if request.param: + # Launch server, append FQCN argument, inject dummy logitproc module + args = default_server_args + request.param + _server_fxn = _server_with_logitproc_module + else: + # Launch server, inject dummy logitproc entrypoint + args = default_server_args + _server_fxn = _server_with_logitproc_entrypoint + + with RemoteOpenAIServerCustom(MODEL_NAME, args, + _server_fxn) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def 
client(server): + async with server.get_async_client() as async_client: + yield async_client + + +# General request argument values for these tests +api_keyword_args = { + # Greedy sampling ensures that requests which receive the `target_token` + # arg will decode it in every step + "temperature": TEMP_GREEDY, + # Since EOS will never be decoded (unless `target_token` is EOS) + "max_tokens": MAX_TOKENS, + # Return decoded token logprobs (as a way of getting token id) + "logprobs": 0, +} + + +@create_new_process_for_each_test() +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str): + """Test custom logitsprocs when starting OpenAI server from CLI + + Launch vLLM OpenAI-compatible server, configured to load a custom logitproc + that has a well-defined behavior (mask out all tokens except one + `target_token`). + + Pass in requests, 50% of which pass a `target_token` value + in through `extra_body["vllm_xargs"]`, 50% of which do not. + + Validate that requests which activate the custom logitproc, repeat the same + token + """ + + use_dummy_logitproc = True + for prompt in prompts: + # Build request arguments + request_keyword_args: dict[str, Any] = { + **api_keyword_args, + } + if use_dummy_logitproc: + # 50% of requests pass target_token custom arg + target_token = random.choice([128, 67]) + # For requests which activate the dummy logitproc, choose one of + # two `target_token` values which are known not to be EOS tokens + request_keyword_args["extra_body"] = { + "vllm_xargs": { + DUMMY_LOGITPROC_ARG: target_token + } + } + batch = await client.completions.create( + model=model_name, + prompt=prompt, + **request_keyword_args, + ) + + if use_dummy_logitproc: + # Only for requests which activate dummy logitproc - validate that + # output token is repeated + choices: openai.types.CompletionChoice = batch.choices + toks = choices[0].logprobs.tokens + if not all([x == toks[0] for x in toks]): + raise AssertionError( + f"Generated {toks} should all be {toks[0]}") + + # Alternate whether to activate dummy logitproc for each request + use_dummy_logitproc = not use_dummy_logitproc diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py new file mode 100644 index 0000000000..c0bfc1a18f --- /dev/null +++ b/tests/v1/logits_processors/utils.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import types +from enum import Enum, auto +from typing import Optional + +import torch + +from vllm.config import VllmConfig +from vllm.sampling_params import SamplingParams +from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, + LogitsProcessor, + MoveDirectionality) + +MODEL_NAME = "facebook/opt-125m" +POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" +DUMMY_LOGITPROC_ARG = "target_token" +TEMP_GREEDY = 0.0 +MAX_TOKENS = 20 +DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc" +DUMMY_LOGITPROC_MODULE = "DummyModule" +DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor" + + +class CustomLogitprocSource(Enum): + """How to source a logitproc for testing purposes""" + LOGITPROC_SOURCE_NONE = auto() # No custom logitproc + LOGITPROC_SOURCE_ENTRYPOINT = auto() # Via entrypoint + LOGITPROC_SOURCE_FQCN = auto() # Via fully-qualified class name (FQCN) + LOGITPROC_SOURCE_CLASS = auto() # Via provided class object + + +# Sample prompts. 
+prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor([self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits + + +"""Dummy module with dummy logitproc class""" +dummy_module = types.ModuleType(DUMMY_LOGITPROC_MODULE) +dummy_module.DummyLogitsProcessor = DummyLogitsProcessor # type: ignore + + +class EntryPoint: + """Dummy entrypoint class for logitsprocs testing""" + + def __init__(self): + self.name = DUMMY_LOGITPROC_ENTRYPOINT + self.value = DUMMY_LOGITPROC_FQCN + + def load(self): + return DummyLogitsProcessor + + +class EntryPoints(list): + """Dummy EntryPoints class for logitsprocs testing""" + + def __init__(self, group: str): + # Emulate list-like functionality + eps = [EntryPoint()] if group == LOGITSPROCS_GROUP else [] + super().__init__(eps) + # Extra attributes + self.names = [ep.name for ep in eps] + + +"""Fake version of importlib.metadata.entry_points""" +entry_points = lambda group: EntryPoints(group) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3a4d48afc9..4e912f98f3 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from vllm.platforms import current_platform -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID, RejectionSampler) @@ -69,7 +69,7 @@ def create_sampling_metadata( output_token_ids=[], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 31c6c881d7..53215f88bb 
100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -9,7 +9,7 @@ import torch from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -173,7 +173,7 @@ def _create_default_sampling_metadata( no_penalties=True, allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) return fake_sampling_metadata diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 74ab19a3ce..d7b4746562 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -169,7 +169,7 @@ def _construct_expected_sampling_metadata( and all(x == 1 for x in repetition_penalties)), allowed_token_ids_mask=allowed_token_ids_mask, bad_words_token_ids=bad_words_token_ids, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 14fc5589a8..51db277f65 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -62,6 +62,7 @@ if TYPE_CHECKING: QuantizationConfig) from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.sample.logits_processor import LogitsProcessor HfOverrides = Union[dict, Callable[[type], type]] else: @@ -72,6 +73,7 @@ else: BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any + LogitsProcessor = Any HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -465,6 +467,9 @@ class ModelConfig: - "transformers" will use the Transformers model implementation.""" override_attention_dtype: Optional[str] = None """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 630fbec453..6fc894827c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -43,6 +43,7 @@ from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) +from vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -435,6 +436,10 @@ class EngineArgs: enable_multimodal_encoder_data_parallel: bool = \ ParallelConfig.enable_multimodal_encoder_data_parallel + logits_processors: Optional[list[Union[ + str, type[LogitsProcessor]]]] = 
ModelConfig.logits_processors + """Custom logitproc types""" + async_scheduling: bool = SchedulerConfig.async_scheduling # DEPRECATED enable_prompt_adapter: bool = False @@ -549,6 +554,8 @@ class EngineArgs: **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) + model_group.add_argument("--logits-processors", + **model_kwargs["logits_processors"]) # Model loading arguments load_kwargs = get_kwargs(LoadConfig) @@ -940,6 +947,7 @@ class EngineArgs: enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, override_attention_dtype=self.override_attention_dtype, + logits_processors=self.logits_processors, ) def validate_tensorizer_args(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b..b002f234c0 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -55,6 +55,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of +from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: from vllm.v1.metrics.reader import Metric @@ -198,6 +199,8 @@ class LLM: override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, + logits_processors: Optional[list[Union[str, + type[LogitsProcessor]]]] = None, **kwargs, ) -> None: """LLM constructor.""" @@ -272,6 +275,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, compilation_config=compilation_config_instance, + logits_processors=logits_processors, **kwargs, ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 64f7426bd6..5cb9f97ae0 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2562,7 +2562,7 @@ def direct_register_custom_op( def resolve_obj_by_qualname(qualname: str) -> Any: """ - Resolve an object by its fully qualified name. + Resolve an object by its fully-qualified class name. 
""" module_name, obj_name = qualname.rsplit(".", 1) module = importlib.import_module(module_name) diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py new file mode 100644 index 0000000000..8220269162 --- /dev/null +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +import itertools +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor, + MinPLogitsProcessor, + MinTokensLogitsProcessor) +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) +from vllm.v1.sample.logits_processor.state import (BatchUpdateBuilder, + LogitsProcessors) + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + +# Error message when the user tries to initialize vLLM with a pooling model +# and custom logitsproces +STR_POOLING_REJECTS_LOGITSPROCS = ("Pooling models do not support custom" + " logits processors.") + +LOGITSPROCS_GROUP = 'vllm.logits_processors' + +BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [ + MinTokensLogitsProcessor, + LogitBiasLogitsProcessor, + MinPLogitsProcessor, +] + + +def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]: + """Load all installed logit processor plugins""" + + import sys + + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP) + if len(installed_logitsprocs_plugins) == 0: + logger.debug("No logitsprocs plugins installed (group %s).", + LOGITSPROCS_GROUP) + return [] + + # Load logitsprocs plugins + logger.debug("Loading installed logitsprocs plugins (group %s):", + LOGITSPROCS_GROUP) + classes: list[type[LogitsProcessor]] = [] + for entrypoint in installed_logitsprocs_plugins: + try: + logger.debug("- Loading logitproc plugin entrypoint=%s target=%s", + entrypoint.name, entrypoint.value) + classes.append(entrypoint.load()) + except Exception as e: + raise RuntimeError( + f"Failed to load LogitsProcessor plugin {entrypoint}") from e + return classes + + +def _load_logitsprocs_by_fqcns( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]] +) -> list[type[LogitsProcessor]]: + """Load logit processor types, identifying them by fully-qualified class + names (FQCNs). + + Effectively, a mixed list of logitproc types and FQCN strings is converted + into a list of entirely logitproc types, by loading from the FQCNs. + + FQCN syntax is : i.e. 
x.y.z:CustomLogitProc + + Already-loaded logitproc types must be subclasses of LogitsProcessor + + Args: + logits_processors: Potentially mixed list of logitsprocs types and FQCN + strings for logitproc types + + Returns: + List of logitproc types + + """ + if not logits_processors: + return [] + + logger.debug( + "%s additional custom logits processors specified, checking whether " + "they need to be loaded.", len(logits_processors)) + + classes: list[type[LogitsProcessor]] = [] + for ldx, logitproc in enumerate(logits_processors): + if isinstance(logitproc, type): + logger.debug(" - Already-loaded logit processor: %s", + logitproc.__name__) + if not issubclass(logitproc, LogitsProcessor): + raise ValueError( + f"{logitproc.__name__} is not a subclass of LogitsProcessor" + ) + classes.append(logitproc) + continue + + logger.debug("- Loading logits processor %s", logitproc) + module_path, qualname = logitproc.split(":") + + try: + # Load module + module = importlib.import_module(module_path) + except Exception as e: + raise RuntimeError( + f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}" + ) from e + + # Walk down dotted name to get logitproc class + obj = module + for attr in qualname.split("."): + obj = getattr(obj, attr) + if not isinstance(obj, type): + raise ValueError("Loaded logit processor must be a type.") + if not issubclass(obj, LogitsProcessor): + raise ValueError( + f"{obj.__name__} must be a subclass of LogitsProcessor") + classes.append(obj) + + return classes + + +def _load_custom_logitsprocs( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]], +) -> list[type[LogitsProcessor]]: + """Load all custom logits processors. + + * First load all installed logitproc plugins + * Second load custom logitsprocs pass by the user at initialization time + + Args: + logits_processors: potentially mixed list of logitproc types and + logitproc type fully-qualified names (FQCNs) + which need to be loaded + + Returns: + A list of all loaded logitproc types + """ + from vllm.platforms import current_platform + if current_platform.is_tpu(): + # No logitsprocs specified by caller + # TODO(andy) - vLLM V1 on TPU does not support custom logitsprocs + return [] + + return (_load_logitsprocs_plugins() + + _load_logitsprocs_by_fqcns(logits_processors)) + + +def build_logitsprocs( + vllm_config: "VllmConfig", + device: torch.device, + is_pin_memory: bool, + is_pooling_model: bool, + custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (), +) -> LogitsProcessors: + if is_pooling_model: + if custom_logitsprocs: + raise ValueError(STR_POOLING_REJECTS_LOGITSPROCS) + logger.debug("Skipping logits processor loading because pooling models" + " do not support logits processors.") + return LogitsProcessors() + custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs) + return LogitsProcessors( + ctor(vllm_config, device, is_pin_memory) for ctor in itertools.chain( + BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes)) + + +__all__ = [ + "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor", + "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder", + "MoveDirectionality", "LogitsProcessors", "build_logitsprocs", + "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP" +] diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor/builtin.py similarity index 54% rename from vllm/v1/sample/logits_processor.py rename to vllm/v1/sample/logits_processor/builtin.py index 3a06e71057..24387ab793 100644 --- 
a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,241 +1,32 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses -from abc import ABC, abstractmethod -from collections.abc import Iterator, Sequence -from dataclasses import dataclass, field -from enum import Enum -from itertools import chain -from typing import Optional, Union +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional import torch -from torch._prims_common import DeviceLikeType -from vllm import PoolingParams, SamplingParams -from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) -logger = init_logger(__name__) - - -class MoveDirectionality(Enum): - # One-way i1->i2 req move within batch - UNIDIRECTIONAL = 0 - # Two-way i1<->i2 req swap within batch - SWAP = 1 - - -# (index, params, output_tok_ids) tuples for new -# requests added to the batch. -AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int]] -# (index 1, index 2, directionality) tuples representing -# one-way moves or two-way swaps of requests in batch -MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. -RemovedRequest = int - - -@dataclasses.dataclass(frozen=True) -class BatchUpdate: - """Persistent batch state change info for logitsprocs""" - batch_size: int # Current num reqs in batch - - # Metadata for requests added to, removed from, and moved - # within the persistent batch. - # - # Note: each added request is represented as - # (index, params, output_tok_ids) - # Key assumption: output_tok_ids is a reference to the - # request's running output tokens list; in this way - # the logits processors always see the latest list of - # generated tokens - removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] - added: Sequence[AddedRequest] - - -class BatchUpdateBuilder: - """Helps track persistent batch state changes and build - a batch update data structure for logitsprocs - - Assumptions: - * All information about requests removed from persistent batch - during a step is aggregated in self._removed through calls to - self.removed_append() at the beginning of a step. This must happen - before the first time that self.removed, self.pop_removed() - or self.peek_removed() are invoked in a given step - * After the first time that self.removed, self.pop_removed() - or self.peek_removed() are read in a step, no new removals - are registered using self.removed_append() - * Elements of self._removed are never directly modified, added or - removed (i.e. modification is only via self.removed_append() and - self.pop_removed()) - - Guarantees under above assumptions: - * self.removed is always sorted in descending order - * self.pop_removed() and self.peek_removed() both return - the lowest removed request index in the current step - """ - - _removed: list[RemovedRequest] - _is_removed_sorted: bool - moved: list[MovedRequest] - added: list[AddedRequest] - - def __init__( - self, - removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, - added: Optional[list[AddedRequest]] = None, - ) -> None: - self._removed = removed or [] - self.moved = moved or [] - self.added = added or [] - self._is_removed_sorted = False - - def _ensure_removed_sorted(self) -> None: - """Sort removed request indices in - descending order. 
- - Idempotent after first call in a - given step, until reset. - """ - if not self._is_removed_sorted: - self._removed.sort(reverse=True) - self._is_removed_sorted = True - - @property - def removed(self) -> list[RemovedRequest]: - """Removed request indices sorted in - descending order""" - self._ensure_removed_sorted() - return self._removed - - def removed_append(self, index: int) -> None: - """Register the removal of a request from - the persistent batch. - - Must not be called after the first time - self.removed, self.pop_removed() or - self.peek_removed() are invoked. - - Args: - index: request index - """ - if self._is_removed_sorted: - raise RuntimeError("Cannot register new removed request after" - " self.removed has been read.") - self._removed.append(index) - - def has_removed(self) -> bool: - return bool(self._removed) - - def peek_removed(self) -> Optional[int]: - """Return lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed[-1] - return None - - def pop_removed(self) -> Optional[int]: - """Pop lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed.pop() - return None - - def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: - """Generate a logitsprocs batch update data structure - and reset internal batch update builder state. - - Args: - batch_size: current persistent batch size - - Returns: - Frozen logitsprocs batch update instance; `None` if no updates - """ - # Reset removal-sorting logic - self._is_removed_sorted = False - if not any((self._removed, self.moved, self.added)): - # No update; short-circuit - return None - # Build batch state update - batch_update = BatchUpdate( - batch_size=batch_size, - removed=self._removed, - moved=self.moved, - added=self.added, - ) - # Reset removed/moved/added update lists - self._removed = [] - self.moved = [] - self.added = [] - return batch_update - - -class LogitsProcessor(ABC): - - @abstractmethod - def apply(self, logits: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - @abstractmethod - def is_argmax_invariant(self) -> bool: - """True if logits processor has no impact on the - argmax computation in greedy sampling. - NOTE: may or may not have the same value for all - instances of a given LogitsProcessor subclass, - depending on subclass implementation. - TODO(andy): won't be utilized until logits - processors are user-extensible - """ - raise NotImplementedError - - @abstractmethod - def update_state( - self, - batch_update: Optional[BatchUpdate], - ) -> None: - """Called when there are new output tokens, prior - to each forward pass. - - Args: - batch_update is non-None iff there have been - changes to the batch makeup. 
- """ - raise NotImplementedError - - -@dataclass -class LogitsProcessorManager: - """Encapsulates initialized logitsproc objects.""" - argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # argmax-invariant logitsprocs - non_argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # non-argmax-invariant logitsprocs - - @property - def all(self) -> Iterator[LogitsProcessor]: - """Iterator over all logits processors.""" - return chain(self.argmax_invariant, self.non_argmax_invariant) - - -###### ----- Built-in LogitsProcessor impls below here +if TYPE_CHECKING: + from vllm.config import VllmConfig class MinPLogitsProcessor(LogitsProcessor): - def __init__(self, max_num_reqs: int, pin_memory: bool, - device: DeviceLikeType): - super().__init__() + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + max_num_reqs = vllm_config.scheduler_config.max_num_seqs self.min_p_count: int = 0 self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ), dtype=torch.float32, device="cpu", - pin_memory=pin_memory) + pin_memory=is_pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.use_double_tensor = torch.device("cpu") != torch.device(device) + self.use_double_tensor = torch.device(device).type != "cpu" if self.use_double_tensor: # Pre-allocated device tensor @@ -260,8 +51,8 @@ class MinPLogitsProcessor(LogitsProcessor): needs_update = False # Process added requests. - for index, params, _ in batch_update.added: - min_p = params.min_p if isinstance(params, SamplingParams) else 0.0 + for index, params, _, _ in batch_update.added: + min_p = params.min_p if self.min_p_cpu[index] != min_p: needs_update = True self.min_p_cpu[index] = min_p @@ -316,11 +107,10 @@ class MinPLogitsProcessor(LogitsProcessor): class LogitBiasLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): - super().__init__() - self.biases: dict[int, dict[int, float]] = {} + def __init__(self, _, device: torch.device, is_pin_memory: bool): self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.biases: dict[int, dict[int, float]] = {} self.bias_tensor: torch.Tensor = torch.tensor(()) self.logits_slice = (self._device_tensor([], torch.int32), @@ -337,9 +127,8 @@ class LogitBiasLogitsProcessor(LogitsProcessor): needs_update: bool = False # Process added requests. - for index, params, _ in batch_update.added: - if isinstance(params, SamplingParams) and (lb := - params.logit_bias): + for index, params, _, _ in batch_update.added: + if lb := params.logit_bias: self.biases[index] = lb needs_update = True else: @@ -400,12 +189,12 @@ class LogitBiasLogitsProcessor(LogitsProcessor): class MinTokensLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): # index -> (min_toks, output_token_ids, stop_token_ids) - super().__init__() - self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} # (req_idx_tensor,eos_tok_id_tensor) self.logits_slice: tuple[torch.Tensor, @@ -424,9 +213,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): if batch_update: # Process added requests. 
- for index, params, output_tok_ids in batch_update.added: - if (isinstance(params, SamplingParams) - and (min_tokens := params.min_tokens) + for index, params, _, output_tok_ids in batch_update.added: + if ((min_tokens := params.min_tokens) and len(output_tok_ids) < min_tokens): # Replace request metadata at batch index self.min_toks[index] = (min_tokens, output_tok_ids, @@ -499,35 +287,3 @@ class MinTokensLogitsProcessor(LogitsProcessor): # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits - - -def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int, - device: torch.device) -> LogitsProcessorManager: - """Construct 'builtin' vLLM logitsprocs which the engine - loads by default. - - Args: - pin_memory_available: pinned memory is available for use - for use by logitsproc - max_num_reqs: ceiling on request count in persistent batch - device: inference device - - Returns: - Data structure encapsulating loaded logitsprocs - """ - min_tokens_logitproc = MinTokensLogitsProcessor( - pin_memory=pin_memory_available, device=device) - logit_bias_logitproc = LogitBiasLogitsProcessor( - pin_memory=pin_memory_available, device=device) - min_p_logitproc = MinPLogitsProcessor( - pin_memory=pin_memory_available, - device=device, - # +1 for temporary swap space - max_num_reqs=max_num_reqs + 1) - return LogitsProcessorManager( - non_argmax_invariant=[ - min_tokens_logitproc, - logit_bias_logitproc, - ], - argmax_invariant=[min_p_logitproc], - ) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py new file mode 100644 index 0000000000..12b4db24bf --- /dev/null +++ b/vllm/v1/sample/logits_processor/interface.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm import SamplingParams + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + +# (index, params, prompt_tok_ids, output_tok_ids) tuples for new +# requests added to the batch. +AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + +# (index 1, index 2, directionality) tuples representing +# one-way moves or two-way swaps of requests in batch +MovedRequest = tuple[int, int, MoveDirectionality] + +# Batch indices of any removed requests. +RemovedRequest = int + + +@dataclass(frozen=True) +class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. 
+ # + # Key assumption: the `output_tok_ids` list (which is an element of each + # tuple in `added`) is a reference to the request's running output tokens + # list; via this reference, the logits processors always see the latest + # list of generated output tokens + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + + +class LogitsProcessor(ABC): + + @abstractmethod + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool) -> None: + raise NotImplementedError + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def is_argmax_invariant(self) -> bool: + """True if logits processor has no impact on the + argmax computation in greedy sampling. + NOTE: may or may not have the same value for all + instances of a given LogitsProcessor subclass, + depending on subclass implementation. + """ + raise NotImplementedError + + @abstractmethod + def update_state( + self, + batch_update: Optional["BatchUpdate"], + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. + """ + raise NotImplementedError diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py new file mode 100644 index 0000000000..0f58b52496 --- /dev/null +++ b/vllm/v1/sample/logits_processor/state.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.v1.sample.logits_processor.interface import (AddedRequest, + BatchUpdate, + MovedRequest, + RemovedRequest) + +if TYPE_CHECKING: + from vllm.v1.sample.logits_processor.interface import LogitsProcessor + + +class BatchUpdateBuilder: + """Helps track persistent batch state changes and build + a batch update data structure for logitsprocs + Assumptions: + * All information about requests removed from persistent batch + during a step is aggregated in self._removed through calls to + self.removed_append() at the beginning of a step. This must happen + before the first time that self.removed, self.pop_removed() + or self.peek_removed() are invoked in a given step + * After the first time that self.removed, self.pop_removed() + or self.peek_removed() are read in a step, no new removals + are registered using self.removed_append() + * Elements of self._removed are never directly modified, added or + removed (i.e. modification is only via self.removed_append() and + self.pop_removed()) + Guarantees under above assumptions: + * self.removed is always sorted in descending order + * self.pop_removed() and self.peek_removed() both return + the lowest removed request index in the current step + """ + + _removed: list[RemovedRequest] + _is_removed_sorted: bool + moved: list[MovedRequest] + added: list[AddedRequest] + + def __init__( + self, + removed: Optional[list[RemovedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, + added: Optional[list[AddedRequest]] = None, + ) -> None: + self._removed = removed or [] + self.moved = moved or [] + self.added = added or [] + self._is_removed_sorted = False + + def _ensure_removed_sorted(self) -> None: + """Sort removed request indices in + descending order. + Idempotent after first call in a + given step, until reset. 
+ """ + if not self._is_removed_sorted: + self._removed.sort(reverse=True) + self._is_removed_sorted = True + + @property + def removed(self) -> list[RemovedRequest]: + """Removed request indices sorted in + descending order""" + self._ensure_removed_sorted() + return self._removed + + def removed_append(self, index: int) -> None: + """Register the removal of a request from the persistent batch. + + Must not be called after the first time self.removed, + self.pop_removed() or self.peek_removed() are invoked. + + Args: + index: request index + """ + if self._is_removed_sorted: + raise RuntimeError("Cannot register new removed request after" + " self.removed has been read.") + self._removed.append(index) + + def has_removed(self) -> bool: + return bool(self._removed) + + def peek_removed(self) -> Optional[int]: + """Return lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed[-1] + return None + + def pop_removed(self) -> Optional[int]: + """Pop lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed.pop() + return None + + def _is_update(self) -> bool: + """True if there is a batch state change""" + return any((self._removed, self.moved, self.added)) + + def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: + """Generate a logitsprocs batch update data structure and reset + internal batch update builder state. + + Args: + batch_size: current persistent batch size + + Returns: + Frozen logitsprocs batch update instance; `None` if no updates + """ + # Reset removal-sorting logic + self._is_removed_sorted = False + if not self._is_update(): + # No update; short-circuit + return None + # Build batch state update + batch_update = BatchUpdate( + batch_size=batch_size, + removed=self._removed, + moved=self.moved, + added=self.added, + ) + self._removed = [] + self.moved = [] + self.added = [] + return batch_update + + +class LogitsProcessors: + """Encapsulates initialized logitsproc objects.""" + + def __init__( + self, + logitsprocs: Optional[Iterator["LogitsProcessor"]] = None) -> None: + self.argmax_invariant: list[LogitsProcessor] = [] + self.non_argmax_invariant: list[LogitsProcessor] = [] + if logitsprocs: + for logitproc in logitsprocs: + (self.argmax_invariant if logitproc.is_argmax_invariant() else + self.non_argmax_invariant).append(logitproc) + + @property + def all(self) -> Iterator["LogitsProcessor"]: + """Iterator over all logits processors.""" + return chain(self.argmax_invariant, self.non_argmax_invariant) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 1189b12f30..9d6a87cea3 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -6,7 +6,7 @@ from typing import Optional import torch -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors @dataclass @@ -40,4 +40,4 @@ class SamplingMetadata: bad_words_token_ids: dict[int, list[list[int]]] # Loaded logits processors - logitsprocs: LogitsProcessorManager + logitsprocs: LogitsProcessors diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 2469e09f82..e718d9d5e0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -18,8 +18,8 @@ from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import 
(BatchUpdateBuilder, - MoveDirectionality, - init_builtin_logitsprocs) + LogitsProcessors, + MoveDirectionality) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice @@ -78,8 +78,11 @@ class InputBatch: pin_memory: bool, vocab_size: int, block_sizes: list[int], # The block_size of each kv cache group + logitsprocs: Optional[LogitsProcessors] = None, is_spec_decode: bool = False, + is_pooling_model: bool = False, ): + self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -221,14 +224,6 @@ class InputBatch: # updates. Should reset each step. self.batch_update_builder = BatchUpdateBuilder() - # Define logits processors. - # TODO(andy): logits processor list should be extensible via engine - # constructor argument; for now the list is fixed. - self.logitsprocs = init_builtin_logitsprocs( - pin_memory_available=pin_memory, - max_num_reqs=max_num_reqs + 1, - device=device) - # TODO convert this to LogitsProcessor self.has_allowed_token_ids: set[str] = set() # NOTE(lufang): In the mask tensor, if the corresponding token allowed, @@ -244,6 +239,10 @@ class InputBatch: self.req_output_token_ids: list[Optional[list[int]]] = [] + # Store provided logitsprocs. If none are provided, initialize empty + # data structure + self.logitsprocs = logitsprocs or LogitsProcessors() + # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() @@ -255,28 +254,35 @@ class InputBatch: # while performing state updates to the batch. return cast(list[str], self._req_ids) - def _get_next_add_index(self) -> int: - if (req_index := self.batch_update_builder.pop_removed()) is not None: - # Fill the empty index. - return req_index - # Append to end - return self.num_reqs - def _register_add_request(self, request: "CachedRequestState") -> int: - """Track add-request operations""" - req_index = self._get_next_add_index() - assert req_index < self.max_num_reqs - params = (request.sampling_params - if request.sampling_params else request.pooling_params) + """Track add-request operations for logits processors. + Not applicable to pooling models. + """ + + # Detailed added request metadata is only required for non-pooling + # models, to support logitsprocs + assert request.sampling_params + + # Fill the next empty index if there is one. + if (new_req_index := self.batch_update_builder.pop_removed()) is None: + # Append to end otherwise. + new_req_index = self.num_reqs + + assert new_req_index < self.max_num_reqs self.batch_update_builder.added.append( - (req_index, params, request.output_token_ids)) - return req_index + (new_req_index, request.sampling_params, request.prompt_token_ids, + request.output_token_ids)) + return new_req_index def add_request( self, request: "CachedRequestState", ) -> int: - req_index = self._register_add_request(request) + if not self.is_pooling_model: + # New request index bookkeeping for autoregressive models. + req_index = self._register_add_request(request) + else: + req_index = self.num_reqs req_id = request.req_id if req_index == len(self._req_ids): @@ -411,7 +417,10 @@ class InputBatch: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: return None - self.batch_update_builder.removed_append(req_index) + if not self.is_pooling_model: + # Autoregressive models require bookkeeping of removed requests to + # support logitsprocs. 
+ self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None @@ -446,6 +455,8 @@ class InputBatch: return req_index def swap_states(self, i1: int, i2: int) -> None: + # For autoregressive models, track detailed request reordering info + # to support logitsprocs self.batch_update_builder.moved.append( (i1, i2, MoveDirectionality.SWAP)) old_id_i1 = self._req_ids[i1] @@ -513,11 +524,18 @@ class InputBatch: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation """ + num_reqs = self.num_reqs + + if self.is_pooling_model: + # Will be contiguous in pooling case, just trim the lists. + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] + return + if not (empty_req_indices := self.batch_update_builder.removed): # All removed requests were replaced by added requests, or else no # requests were removed at all. No condense() needed return - num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. self._req_ids.clear() @@ -541,6 +559,8 @@ class InputBatch: # Move active request down into empty request # index. self.batch_update_builder.pop_removed() + # Autoregressive models require detailed tracking of condense + # operations to support logitsprocs self.batch_update_builder.moved.append( (last_req_index, empty_index, MoveDirectionality.UNIDIRECTIONAL)) @@ -596,15 +616,20 @@ class InputBatch: last_req_index -= 1 # Trim lists to the batch size. - del self._req_ids[self.num_reqs:] - del self.req_output_token_ids[self.num_reqs:] + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] def refresh_metadata(self): - """Apply batch updates, reset input batch at end of step + """Apply any batch updates to sampling metadata.""" - * Apply batch add/remove/permute to logits procs' states - * If batch state is modified, update sampling metadata - """ + if self.is_pooling_model: + # Batch changes every step for pooling models. + self.sampling_metadata = self._make_sampling_metadata() + return + + # For non-pooling models - generate and apply logitsprocs update; + # reset batch update tracking. + # Update sampling metadata if batch state is changed. 
batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) for logit_proc in self.logitsprocs.all: logit_proc.update_state(batch_update) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5ee44a8257..4219d9147a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -68,6 +68,7 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -80,7 +81,6 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from ..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -221,6 +221,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=[self.cache_config.block_size], is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=build_logitsprocs( + self.vllm_config, self.device, self.pin_memory, + self.is_pooling_model, + self.vllm_config.model_config.logits_processors), + is_pooling_model=self.is_pooling_model, ) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. @@ -2447,7 +2452,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): output_token_ids=[[] for _ in range(num_reqs)], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) try: sampler_output = self.sampler(logits=logits, @@ -2968,6 +2973,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=block_sizes, is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=self.input_batch.logitsprocs, + is_pooling_model=self.is_pooling_model, ) def _allocate_kv_cache_tensors( From a258ad8bcc0014c04d11a9bc8c6591b379c31b68 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 17 Aug 2025 08:41:23 +0800 Subject: [PATCH 341/932] [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031) Signed-off-by: Jinzhen Lin --- vllm/model_executor/layers/quantization/fp8.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a497449132..f07be08554 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -125,6 +125,10 @@ class Fp8Config(QuantizationConfig): ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) + if not ignored_layers: + ignored_layers = cls.get_from_keys_or(config, + ["modules_to_not_convert"], + None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme, ignored_layers=ignored_layers, From 94096a47c92c4a53ad44cfffdca918669c0f89e0 Mon Sep 17 00:00:00 2001 From: Michael 
Goin Date: Sat, 16 Aug 2025 22:16:42 -0400 Subject: [PATCH 342/932] [UX] Separate marlin moe config logic from triton moe (#23006) --- .../layers/fused_moe/fused_marlin_moe.py | 20 ++++++------------- .../layers/fused_moe/fused_moe.py | 9 +-------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index a49d41c184..3c6ece6737 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" -import functools from typing import Optional import torch import vllm._custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_moe import ( - moe_align_block_size, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_make_workspace_new, maybe_warn_marlin_atomic_add) from vllm.scalar_type import ScalarType, scalar_types @@ -98,17 +96,11 @@ def fused_marlin_moe(hidden_states: torch.Tensor, N = w2.shape[1] * 16 topk = topk_ids.shape[1] - get_config_func = functools.partial( - try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - None, - is_marlin=True, - ) - config = get_config_func(M) - - block_size_m = config["BLOCK_SIZE_M"] + # M block size selection logic + # TODO: tune this further for specific models + for block_size_m in [8, 16, 32, 48, 64]: + if M * topk / E / block_size_m < 0.9: + break if global_num_experts == -1: global_num_experts = E diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index e58a9e568d..3579ca22ba 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -801,7 +801,6 @@ def get_default_config( K: int, topk: int, dtype: Optional[str], - is_marlin: bool, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: @@ -832,11 +831,6 @@ def get_default_config( config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1} else: config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1} - elif is_marlin: - for block_size_m in [8, 16, 32, 48, 64]: - if M * topk / E / block_size_m < 0.9: - break - return {"BLOCK_SIZE_M": block_size_m} elif M <= E: config = { "BLOCK_SIZE_M": 16, @@ -860,7 +854,6 @@ def try_get_optimal_moe_config( top_k: int, dtype: Optional[str], M: int, - is_marlin: bool = False, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: from vllm.model_executor.layers.fused_moe import get_config @@ -883,7 +876,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin, block_shape) + block_shape) return config From 5c32143b9db19ae728087019678843fa238afa82 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 17 Aug 2025 12:05:50 +0800 Subject: [PATCH 343/932] [Refactor] Defer tensor data construction in MultiModalKwargs (#23030) Signed-off-by: DarkLight1337 --- tests/multimodal/test_cache.py | 2 +- tests/v1/test_serial_utils.py | 34 +------ vllm/inputs/registry.py | 2 +- .../models/prithvi_geospatial_mae.py | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 96 
+++++++++++-------- vllm/multimodal/processing.py | 2 +- vllm/multimodal/utils.py | 12 ++- vllm/sequence.py | 4 +- vllm/v1/serial_utils.py | 17 +--- vllm/v1/worker/gpu_input_batch.py | 2 +- 12 files changed, 73 insertions(+), 104 deletions(-) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e07b73bd25..2149f05b6a 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -25,7 +25,7 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ + return MultiModalKwargs([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 0ab4e0bf59..586276ee08 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -100,38 +100,6 @@ class MyRequest(msgspec.Struct): def test_multimodal_kwargs(): - d = { - "foo": - torch.zeros(20000, dtype=torch.float16), - "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)], - "baz": [ - torch.rand((256), dtype=torch.float16), - [ - torch.rand((1, 12), dtype=torch.float32), - torch.rand((3, 5, 7), dtype=torch.float64), - ], [torch.rand((4, 4), dtype=torch.float16)] - ], - } - - # pack mm kwargs into a mock request so that it can be decoded properly - req = MyRequest(mm=[MultiModalKwargs(d)]) - - encoder = MsgpackEncoder() - decoder = MsgpackDecoder(MyRequest) - - encoded = encoder.encode(req) - - assert len(encoded) == 6 - - total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - - # expected total encoding length, should be 44559, +-20 for minor changes - assert 44539 <= total_len <= 44579 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] - assert all(nested_equal(d[k], decoded[k]) for k in d) - - -def test_multimodal_items_by_modality(): e1 = MultiModalFieldElem("audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()) @@ -151,7 +119,7 @@ def test_multimodal_items_by_modality(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs.from_items([audio, video, image]) + mm = MultiModalKwargs([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc32365083..ef146fdfbf 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -240,6 +240,6 @@ class InputRegistry: return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data, + multi_modal_data=dec_data.multi_modal_data.get_data(), multi_modal_placeholders=dec_data.multi_modal_placeholders, ) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 20f423cc76..6848882907 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -136,7 +136,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): type="multimodal", prompt=prompt, prompt_token_ids=[1], - mm_kwargs=MultiModalKwargs.from_items(multimodal_kwargs_items), + mm_kwargs=MultiModalKwargs(multimodal_kwargs_items), mm_hashes=None, mm_placeholders=mm_placeholders, ) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 
7188ed14c5..ef8f1b2e17 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs({}), {} + return MultiModalKwargs(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 6074a4d54f..8c4136e06f 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -46,7 +46,7 @@ class MultiModalCache: ) -> int: # MultiModalKwargs is not a subclass of dict if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data, debug=debug) + return cls.get_item_size(leaf.get_data(), debug=debug) # MultiModalKwargsItem is not a subclass of dict if isinstance(leaf, MultiModalKwargsItem): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index a33ce14699..d3f57cf533 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -653,7 +653,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) - def __init__(self, data: Mapping[str, MultiModalFieldElem]) -> None: + def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None: super().__init__(data) modalities = {elem.modality for elem in self.data.values()} @@ -668,9 +668,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): return {key: elem.data for key, elem in self.items()} -# NOTE: UserDict is for V0 compatibility. -# V1 should access individual items via `get_item`. -class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargs: """ A dictionary that represents the keyword arguments to [`torch.nn.Module.forward`][]. 
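This refactor makes MultiModalKwargs wrap a sequence of MultiModalKwargsItem objects directly (the old MultiModalKwargs.from_items classmethod goes away) and defers building the flat key-to-tensor mapping until get_data() is called. A minimal usage sketch, not taken from the patch, is shown below; the tensor shape and the vllm.multimodal.inputs import location are assumptions.

# Illustrative sketch only (not part of the patch): construct MultiModalKwargs
# from items and materialize the per-key data lazily via get_data().
import torch

from vllm.multimodal.inputs import (MultiModalBatchedField,
                                    MultiModalFieldElem, MultiModalKwargs,
                                    MultiModalKwargsItem)

# One modality item holding a single keyword-argument tensor.
elem = MultiModalFieldElem("image", "pixel_values",
                           torch.zeros(3, 224, 224),
                           MultiModalBatchedField())
item = MultiModalKwargsItem.from_elems([elem])

# New style: pass items straight to the constructor
# (previously MultiModalKwargs.from_items([...])).
mm = MultiModalKwargs([item])

# The flat key -> tensor mapping is only built on first access.
data = mm.get_data()
assert "pixel_values" in data

Callers that need pinned host memory can request it at materialization time via get_data(pin_memory=True), which is what the commented-out path in group_mm_kwargs_by_modality further down hints at.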
@@ -714,40 +712,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): elems = [v[item_idx] for v in elems_in_modality.values()] items.append(MultiModalKwargsItem.from_elems(elems)) - return MultiModalKwargs.from_items(items) + return MultiModalKwargs(items) - @staticmethod - def from_items( - items: Sequence[MultiModalKwargsItem], - *, - pin_memory: bool = False, - ): - """Construct a new - [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] - from multiple items.""" - elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for item in items: - for key, elem in item.items(): - elems_by_key[key].append(elem) + def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None: + super().__init__() - data = { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 - } - - return MultiModalKwargs(data, items=items) - - def __init__( - self, - data: Mapping[str, NestedTensors], - *, - items: Optional[Sequence[MultiModalKwargsItem]] = None, - ) -> None: - super().__init__(data) - - items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) + self._data: Optional[Mapping[str, NestedTensors]] = None + @property def modalities(self): return self._items_by_modality.keys() @@ -839,22 +813,41 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return cast(BatchedTensorInputs, json_mapped) - def __delitem__(self, key: str) -> None: - super().__delitem__(key) + def keys(self): + return self.get_data().keys() + + def values(self): + return self.get_data().values() + + def items(self): + return self.get_data().items() + + def get(self, key: str, /, default=None): + return self.get_data().get(key, default) + + def pop(self, key: str, *args, **kwargs): + data = dict(self.get_data()) + res = data.pop(key, *args, **kwargs) for items in self._items_by_modality.values(): for item in items: - item.pop(key, None) + item.pop(key, *args, **kwargs) + + self._data = None + + return res + + def __iter__(self): + return iter(self.get_data()) + + def __getitem__(self, key: str): + return self.get_data()[key] def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_modality != other._items_by_modality: - return False - ks = self.keys() - return (ks == other.keys() - and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + return self._items_by_modality == other._items_by_modality def _validate_modality(self, method_name: str, modality: str) -> None: if not self._items_by_modality: @@ -888,6 +881,25 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_data(self, + *, + pin_memory: bool = False) -> Mapping[str, NestedTensors]: + if self._data is not None: + return self._data + + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for items in self._items_by_modality.values(): + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + + data = { + key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + for key, elems in elems_by_key.items() if len(elems) > 0 + } + self._data = data + return data + MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 
38c5d5d99f..4684bf6f3d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1480,7 +1480,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_missing_kwargs=mm_missing_kwargs, ) - mm_kwargs = MultiModalKwargs.from_items([ + mm_kwargs = MultiModalKwargs([ item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f914d0dc6c..a80f09bb19 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -402,12 +402,14 @@ def group_mm_kwargs_by_modality( for modality, items in groupby(mm_kwargs, key=lambda item: item.modality): items_lst = list(items) - # mm_kwargs_group = MultiModalKwargs.from_items(items_lst, - # pin_memory=pin_memory) + # mm_kwargs_group = MultiModalKwargs(items_lst) \ + # .get_data(pin_memory=pin_memory) # if device is not None: - # mm_kwargs_group = json_map_leaves(lambda x: x.to(device=device), - # mm_kwargs_group.data) + # mm_kwargs_group = json_map_leaves( + # lambda x: x.to(device=device), + # mm_kwargs_group, + # ) # TODO: Once V0 is removed, we can use the merging logic above # to avoid creating an extra batch dimension (except for fields @@ -415,7 +417,7 @@ def group_mm_kwargs_by_modality( # We will also need to update each model to remove `flatten_bn`. mm_kwargs_group = MultiModalKwargs.as_kwargs( MultiModalKwargs.batch( - [MultiModalKwargs.from_items([item]) for item in items_lst], + [MultiModalKwargs([item]) for item in items_lst], pin_memory=pin_memory, ), device=device, diff --git a/vllm/sequence.py b/vllm/sequence.py index cbe63f8d1d..b3be10b6bb 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -524,7 +524,7 @@ class Sequence: if self.inputs["type"] == "multimodal": return self.inputs["mm_kwargs"] - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: @@ -780,7 +780,7 @@ class SequenceGroup: return self.first_seq.multi_modal_data elif self.encoder_seq is not None: return self.encoder_seq.multi_modal_data - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 3f0fad8a64..2857d8ef42 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -117,16 +117,9 @@ class MsgpackEncoder: return self._encode_mm_item(obj) if isinstance(obj, MultiModalKwargs): - mm: MultiModalKwargs = obj - if not mm.modalities: - # just return the main dict if there are no modalities. - return dict(mm) - - # ignore the main dict, it will be re-indexed. - # Any tensors *not* indexed by modality will be ignored. 
return [ self._encode_mm_item(item) - for itemlist in mm._items_by_modality.values() + for itemlist in obj._items_by_modality.values() for item in itemlist ] @@ -268,13 +261,7 @@ class MsgpackDecoder: if issubclass(t, MultiModalKwargsItem): return self._decode_mm_item(obj) if issubclass(t, MultiModalKwargs): - if isinstance(obj, list): - return MultiModalKwargs.from_items( - self._decode_mm_items(obj)) - return MultiModalKwargs({ - k: self._decode_nested_tensors(v) - for k, v in obj.items() - }) + return MultiModalKwargs(self._decode_mm_items(obj)) if t is UtilityResult: return self._decode_utility_result(obj) return obj diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e718d9d5e0..3d4cf27a6c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -58,7 +58,7 @@ class CachedRequestState: @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " "removed in v0.13. Please use `mm_kwargs` instead.") def mm_inputs(self) -> list[MultiModalKwargs]: - return [MultiModalKwargs.from_items([item]) for item in self.mm_kwargs] + return [MultiModalKwargs([item]) for item in self.mm_kwargs] def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: From 87f48623a537d379284bb3e3d1b23ab0ee2af1c1 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 17 Aug 2025 12:49:14 +0800 Subject: [PATCH 344/932] [Misc] method name typo fix (#23042) Signed-off-by: Andy Xie --- vllm/v1/worker/cpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 11b96d9463..a7180afbd6 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -29,7 +29,7 @@ class CPUModelRunner(GPUModelRunner): self.use_cuda_graph = False self.cascade_attn_enabled = False - self._postprocess_tenosrs() + self._postprocess_tensors() def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ @@ -59,7 +59,7 @@ class CPUModelRunner(GPUModelRunner): self.attn_groups[0][0].metadata_builder.reorder_batch( self.input_batch, scheduler_output) - def _postprocess_tenosrs(self) -> None: + def _postprocess_tensors(self) -> None: # Note: replace device tensors with cpu tensors def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None: From 4d4061b6e73d82f7e561fff64c2bd914d66ebaff Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 17 Aug 2025 13:03:24 +0800 Subject: [PATCH 345/932] [Kernel] Add cuda kernel for gpt_oss activation (#22951) Signed-off-by: Jee Jee Li --- csrc/activation_kernels.cu | 59 +++++++++++++++++++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 6 ++ tests/kernels/core/test_activation.py | 45 ++++++++++++-- vllm/model_executor/layers/activation.py | 41 ++++++++++++- .../layers/fused_moe/fused_marlin_moe.py | 22 ++----- .../layers/fused_moe/fused_moe.py | 18 ++---- .../layers/quantization/utils/mxfp4_utils.py | 4 +- vllm/model_executor/models/gpt_oss.py | 2 +- 9 files changed, 157 insertions(+), 42 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e6596797..a4a880f13c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, + float alpha, float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? 
limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? -limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 6e39758f16..64bcec6ca1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 5fee106335..7079671c2e 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,6 +130,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + ops.def( + "swigluoai_and_mul(Tensor! out, Tensor input, float alpha=1.702, float " + "limit=7.0) " + "-> ()"); + ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8b..ec5c60fd7b 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul) + SiluAndMul, SwigluOAIAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,7 +25,15 @@ CUDA_DEVICES = [ @pytest.mark.parametrize( "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + [ + "silu_and_mul", + "mul_and_silu", + "gelu", + "gelu_tanh", + "fatrelu", + "swigluoai_and_mul", + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -59,18 +67,43 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul + elif activation == "swigluoai_and_mul": + layer = SwigluOAIAndMul() + fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. - torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + if activation == "swigluoai_and_mul": + + rtol = { + #For fp16, change the relative tolerance from 1e-3 to 2e-3 + torch.float16: + 2e-3, + torch.bfloat16: + 2e-2, + torch.float: + 1.3e-6 + } + + def _get_rtol(output) -> float: + return rtol[output.dtype] + + torch.testing.assert_close(out, + ref_out, + atol=get_default_atol(out), + rtol=_get_rtol(out)) + else: + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. 
+ torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) + elif activation == "swigluoai_and_mul": + opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7ce44174ea..86ab4f546d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,6 +239,35 @@ class GeluAndMul(CustomOp): return f'approximate={repr(self.approximate)}' +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -330,6 +359,7 @@ class ReLUSquaredActivation(CustomOp): return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + #TODO : implement cuda kenrels return self.forward_native(x) @@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": lambda: GeluAndMul(), - "silu": lambda: SiluAndMul(), - "geglu": lambda: GeluAndMul(), + "gelu": + lambda: GeluAndMul(), + "silu": + lambda: SiluAndMul(), + "geglu": + lambda: GeluAndMul(), + "swigluoai": + lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 3c6ece6737..1e3ac6cd79 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -161,25 +161,13 @@ def fused_marlin_moe(hidden_states: torch.Tensor, if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - elif activation == "swiglu_oai": - # NOTE: in gpt-oss, the gate_proj and up_proj is interleaved - # - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2] - # - origin: gate, up = gate_up[..., :N], gate_up[..., N:] - - @torch.compile(dynamic=True) - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - return (up + 1) * glu - - intermediate_cache2 = swiglu_oai(intermediate_cache1) + elif activation == "swigluoai": + # alpha = 1.702, limit = 7.0 + 
torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) else: raise ValueError(f"Unsupported activation: {activation}. " - "Only silu and swiglu_oai activations are supported.") + "Only silu and swigluoai activations are supported.") if expert_map is not None: intermediate_cache3.zero_() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3579ca22ba..02b7b65f4a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1621,17 +1621,6 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) - # TODO fused kernel - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1639,13 +1628,16 @@ def fused_experts_impl( elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swigluoai" and is_act_and_mul: + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - elif activation == "swiglu_oai": - intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) + else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index deeb69bcad..48f9cc3737 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -61,14 +61,14 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "swiglu_oai", + activation: str = "swigluoai", expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swiglu_oai" + or scoring_func != "softmax" or activation != "swigluoai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7c7712dbe1..2f5d9ddd90 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ class MLPBlock(torch.nn.Module): prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swiglu_oai") + activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From fe0411fc6fa32cebeacd3a3aef87a591e7309c45 Mon Sep 17 00:00:00 2001 From: 
947132885 <947132885@qq.com> Date: Sun, 17 Aug 2025 16:46:36 +0800 Subject: [PATCH 346/932] [Bugfix] should use stack instead of concat (#22972) Signed-off-by: 947132885 <947132885@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/transformers.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4ec2b683fc..f3b7263ca3 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -694,6 +694,17 @@ class TransformersForCausalLM(TransformersBase): return logits +def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: + """Flatten until a list of tensors can be concatenated then do concat""" + + def _can_concat(x: list[torch.Tensor]): + return len(set(map(lambda _x: _x.shape[1:], x))) == 1 + + if _can_concat(x): + return torch.concat(x) + return flatten_and_concat(flatten_bn(x)) + + @MULTIMODAL_REGISTRY.register_processor( MultiModalProcessor, info=MultiModalProcessingInfo, @@ -766,8 +777,7 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): if isinstance(pixel_values, torch.Tensor): pixel_values = flatten_bn(pixel_values).to(self.dtype) elif is_list_of(pixel_values, torch.Tensor): - pixel_values = flatten_bn(flatten_bn(pixel_values), - concat=True).to(self.dtype) + pixel_values = flatten_and_concat(pixel_values).to(self.dtype) else: raise ValueError( f"Unsupported pixel_values type {type(pixel_values)}. " From 16bff144be6739c9f773968ace0b9cd239f67f19 Mon Sep 17 00:00:00 2001 From: Kevinzz Date: Sun, 17 Aug 2025 16:56:20 +0800 Subject: [PATCH 347/932] [Misc] fix typo in the multimodal doc (#23051) --- docs/features/multimodal_inputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cdd32924b5..9d51f9cf52 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -216,7 +216,7 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown from vllm import LLM, SamplingParams from qwen_vl_utils import process_vision_info - model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + model_path = "Qwen/Qwen2.5-VL-3B-Instruct" video_path = "https://content.pexels.com/videos/free-videos.mp4" llm = LLM( From 292084e72ac553dbe14eb897372617a786322a2a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 17 Aug 2025 11:52:04 -0400 Subject: [PATCH 348/932] [BugFix] Fix for IMA in FA3 varlen combine (#22967) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4e2a0e4533..49defccbb1 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd + GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From c55bc1db26f5e4385c8a2c1b7e6ba8b54ab2e060 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 10:36:46 -0700 Subject: [PATCH 349/932] [Misc] Remove dead return (#23061) 
Signed-off-by: Woosuk Kwon --- vllm/model_executor/models/qwen2_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f2d438b385..9e2f7ca42b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1225,7 +1225,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return [] - return None # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). From 6d243efedab9a03348cbd55fe966b62a08d90676 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 12:41:38 -0700 Subject: [PATCH 350/932] [Misc] Convert use_structured_output property into constant (#23060) Signed-off-by: Woosuk Kwon --- vllm/v1/request.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 8b703b6191..4e99a9ccef 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -54,8 +54,7 @@ class Request: time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: - self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,12 +62,15 @@ class Request: self.kv_transfer_params: Optional[dict[str, Any]] = None if pooling_params is not None: + # Pooling models. self.max_tokens = 1 elif sampling_params is not None: + # Generative models. assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens if sampling_params.guided_decoding is not None: self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = \ @@ -192,11 +194,6 @@ class Request: num_tokens = self.mm_positions[input_id].length return num_tokens - @property - def use_structured_output(self) -> bool: - return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None - def record_event( self, event_type: EngineCoreEventType, From 21e39436c8062ebbf4a160eebf56d7d303896e68 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Mon, 18 Aug 2025 05:45:42 +0800 Subject: [PATCH 351/932] [XPU] fix xpu to set cudagraph batch sizes (#23044) Signed-off-by: calvin chen --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4219d9147a..adaa1306f6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -232,8 +232,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. - self.cudagraph_batch_sizes = list( - reversed(self.compilation_config.cudagraph_capture_sizes)) + if self.compilation_config.cudagraph_capture_sizes and \ + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + self.cudagraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
self._init_device_properties() From 0fc8fa751a4321d6531467537ff77cf3c1c70260 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 17 Aug 2025 15:56:07 -0700 Subject: [PATCH 352/932] fix: gptq marlin weight loading failure (#23066) --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bd14ab9ef6..c5d1e01701 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -56,7 +56,7 @@ def get_moe_quant_method( # Dynamic per module/layer rules may override base config override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config) + return moe_method_cls(cloned_config, layer.moe_config) return None From 8ea0c2753a273e24957ab4587c200a3254ebe970 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 18:16:03 -0700 Subject: [PATCH 353/932] [Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index adaa1306f6..fc320be1c3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1722,7 +1722,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Compute prompt logprobs if needed. prompt_logprobs_dict = self._get_prompt_logprobs_dict( hidden_states[:num_scheduled_tokens], - scheduler_output, + scheduler_output.num_scheduled_tokens, ) # Get the valid generated tokens. @@ -2064,7 +2064,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, - scheduler_output: "SchedulerOutput", + num_scheduled_tokens: dict[str, int], ) -> dict[str, Optional[LogprobsTensors]]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: @@ -2077,8 +2077,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # maintainable loop over optimal performance. completed_prefill_reqs = [] for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): - - num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_tokens = num_scheduled_tokens[req_id] # Get metadata for this request. 
request = self.requests[req_id] From 7be3a59d8ee7014d6462c258222cbfa8be815831 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:09:08 +0800 Subject: [PATCH 354/932] [Misc] enhance static type hint (#23059) Signed-off-by: Andy Xie --- vllm/v1/worker/lora_model_runner_mixin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 2fbdee4724..84ed46989e 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -8,6 +8,7 @@ from contextlib import contextmanager from typing import Union import numpy as np +import torch import torch.nn as nn from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig @@ -31,7 +32,8 @@ class LoRAModelRunnerMixin: def load_lora_model(self, model: nn.Module, model_config: ModelConfig, scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, device: str) -> nn.Module: + lora_config: LoRAConfig, + device: torch.device) -> nn.Module: if not supports_lora(model): raise ValueError( From 9f1c6422549d37eee22bfa4dbadaaa91d95e98ba Mon Sep 17 00:00:00 2001 From: double7 <33449816+DoubleVII@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:09:11 +0800 Subject: [PATCH 355/932] [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com> Co-authored-by: 杨森 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e95295c318..59411eb750 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,6 +88,11 @@ def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]): video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) video_grid_sizes = video_grid_thw.prod(-1) + # vllm use `second_per_grid_ts` to compute multimodal rotary embedding + video_second_per_grid = hf_inputs.get("video_second_per_grid", None) + if video_second_per_grid is not None: + hf_inputs["second_per_grid_ts"] = video_second_per_grid + return dict( input_audio_features=MultiModalFieldConfig.flat_from_sizes( "audio", audio_feature_lengths, dim=1), From b2fd0b81e065c677ceebecb9a0e1ee6f226b7cec Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Mon, 18 Aug 2025 07:10:26 +0200 Subject: [PATCH 356/932] [Bugfix][CI] Machete kernels: deterministic ordering for more cache hits (#23055) Signed-off-by: Andy Lo --- csrc/quantization/machete/generate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 9af7833d09..88b3f9c734 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -349,9 +349,12 @@ def to_cute_constant(value: list[int]): def unique_schedules(impl_configs: list[ImplConfig]): - return list( - set(sch for impl_config in impl_configs - for sch in impl_config.schedules)) + # Use dict over set for deterministic ordering + return list({ + sch: None + for impl_config in impl_configs + for sch in impl_config.schedules + }.keys()) def unsigned_type_with_bitwidth(num_bits): From 
08d5f7113a024818b2867782c2539794b7aa162b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:16:21 +0800 Subject: [PATCH 357/932] [Misc] refactor function name (#23029) Signed-off-by: Andy Xie --- vllm/platforms/cpu.py | 2 +- vllm/v1/worker/cpu_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0b16a8e1d1..fe258f76b9 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -268,7 +268,7 @@ class CpuPlatform(Platform): DEFAULT_MAX_NUM_BATCHED_TOKENS) @classmethod - def get_allowed_cpu_memory_node_list( + def get_allowed_cpu_core_node_list( cls) -> tuple[list[int], list[LogicalCPUInfo]]: assert platform.system() == "Linux" diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 2dc28d9304..f83d680484 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -132,7 +132,7 @@ class CPUWorker(Worker): """ allowed_numa_nodes, logical_cpu_list = \ - CpuPlatform.get_allowed_cpu_memory_node_list() + CpuPlatform.get_allowed_cpu_core_node_list() assert len(allowed_numa_nodes) >= self.parallel_config.world_size, ( f"No enough allowed NUMA nodes to bind threads of " f"{self.parallel_config.world_size} CPUWorkers. " From 89657a557c6831cca9fa5e59822af0cf27d67a98 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 17 Aug 2025 23:33:29 -0700 Subject: [PATCH 358/932] [Misc] Fix backward compatibility from #23030 (#23070) Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/multimodal/base.py | 9 ++++++--- vllm/multimodal/inputs.py | 6 +++--- vllm/sequence.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17..c4bb8d56ce 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar if TYPE_CHECKING: from vllm.sequence import SequenceGroupMetadata -from .inputs import MultiModalKwargs, PlaceholderRange +from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange _T = TypeVar("_T") @@ -56,7 +56,8 @@ class MultiModalPlaceholderMap: @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: + ) -> tuple[dict[str, NestedTensors], dict[str, + "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a prompt (``seq_group``) represented by ``positions``, as well as a @@ -99,7 +100,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} + return MultiModalKwargs().get_data(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() @@ -116,6 +117,8 @@ class MultiModalPlaceholderMap: placeholder_maps[modality] = placeholder_map + seq_mm_data = seq_mm_data if isinstance( + seq_mm_data, dict) else seq_mm_data.get_data() return seq_mm_data, placeholder_maps def append_items_from_seq_group( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index d3f57cf533..3e0bfce59c 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -664,7 +664,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def modality(self) -> str: return self._modality - def get_data(self) -> Mapping[str, NestedTensors]: + def get_data(self) -> dict[str, NestedTensors]: return {key: elem.data for key, elem in 
self.items()} @@ -720,7 +720,7 @@ class MultiModalKwargs: items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) - self._data: Optional[Mapping[str, NestedTensors]] = None + self._data: Optional[dict[str, NestedTensors]] = None @property def modalities(self): @@ -883,7 +883,7 @@ class MultiModalKwargs: def get_data(self, *, - pin_memory: bool = False) -> Mapping[str, NestedTensors]: + pin_memory: bool = False) -> dict[str, NestedTensors]: if self._data is not None: return self._data diff --git a/vllm/sequence.py b/vllm/sequence.py index b3be10b6bb..2cb254381e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -22,6 +22,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.multimodal.inputs import NestedTensors from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) @@ -978,7 +979,8 @@ class SequenceGroupMetadata( state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) token_type_ids: Optional[list[int]] = None - multi_modal_data: Optional[MultiModalKwargs] = None + multi_modal_data: Optional[Union[MultiModalKwargs, + dict[str, "NestedTensors"]]] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None From 5f5664b3e4ff8046e26c36165a1294205cb429c5 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 15:04:08 +0800 Subject: [PATCH 359/932] [XPU] Fix compile size for xpu (#23069) Signed-off-by: Kunshang Ji --- vllm/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 51db277f65..cd2be212c2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3548,7 +3548,7 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: From 5c79b0d6484d7d4c5fe007c3c7ad04c72d3bc59e Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 17:47:03 +0800 Subject: [PATCH 360/932] [XPU][CI]add xpu env vars in CI scripts (#22946) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index deb61a9baf..445cd2735c 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -23,9 +23,13 @@ docker run \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ --name "${container_name}" \ "${image_name}" \ - sh -c ' + bash -c ' + set -e + echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py 
--model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp @@ -35,8 +39,8 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_utils.py pytest -v -s v1/test_metrics_reader.py From 27e8d1ea3ea9864f371f639daaa5315bf3250364 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 18 Aug 2025 17:52:00 +0800 Subject: [PATCH 361/932] [Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053) Signed-off-by: DarkLight1337 --- docs/api/README.md | 1 + docs/contributing/model/multimodal.md | 4 +- .../multimodal/processing/test_common.py | 14 +- .../multimodal/processing/test_glm4_1v.py | 3 +- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- .../multimodal/processing/test_llama4.py | 10 +- .../multimodal/processing/test_mllama.py | 6 +- .../multimodal/processing/test_mllama4.py | 10 +- .../multimodal/processing/test_nemotron_vl.py | 3 +- .../multimodal/processing/test_qwen2_vl.py | 3 +- tests/models/multimodal/test_tensor_schema.py | 2 +- tests/multimodal/test_cache.py | 11 +- tests/v1/test_serial_utils.py | 22 ++- vllm/executor/msgspec_utils.py | 9 +- vllm/model_executor/models/aria.py | 4 +- vllm/model_executor/models/aya_vision.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/cohere2_vision.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 7 +- vllm/model_executor/models/florence2.py | 4 +- vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/gemma3_mm.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 4 +- vllm/model_executor/models/glm4_1v.py | 10 +- vllm/model_executor/models/glm4v.py | 4 +- vllm/model_executor/models/granite_speech.py | 4 +- vllm/model_executor/models/h2ovl.py | 16 +- .../models/hyperclovax_vision.py | 27 +-- vllm/model_executor/models/idefics3.py | 4 +- vllm/model_executor/models/interns1.py | 13 +- vllm/model_executor/models/internvl.py | 34 ++-- vllm/model_executor/models/keye.py | 7 +- vllm/model_executor/models/kimi_vl.py | 4 +- vllm/model_executor/models/llava.py | 6 +- .../model_executor/models/llava_next_video.py | 4 +- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/minicpmo.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mistral3.py | 4 +- vllm/model_executor/models/mllama.py | 7 +- vllm/model_executor/models/mllama4.py | 12 +- vllm/model_executor/models/molmo.py | 4 +- vllm/model_executor/models/nvlm_d.py | 13 +- vllm/model_executor/models/ovis.py | 9 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- 
vllm/model_executor/models/phi4_multimodal.py | 4 +- vllm/model_executor/models/phi4mm.py | 4 +- vllm/model_executor/models/pixtral.py | 7 +- .../models/prithvi_geospatial_mae.py | 7 +- .../models/qwen2_5_omni_thinker.py | 15 +- vllm/model_executor/models/qwen2_audio.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 7 +- vllm/model_executor/models/qwen_vl.py | 4 +- vllm/model_executor/models/skyworkr1v.py | 13 +- vllm/model_executor/models/step3_vl.py | 14 +- vllm/model_executor/models/tarsier.py | 4 +- vllm/model_executor/models/transformers.py | 6 +- vllm/model_executor/models/ultravox.py | 9 +- vllm/model_executor/models/voxtral.py | 7 +- vllm/model_executor/models/whisper.py | 4 +- vllm/multimodal/__init__.py | 4 +- vllm/multimodal/base.py | 9 +- vllm/multimodal/cache.py | 21 ++- vllm/multimodal/inputs.py | 172 ++++++++---------- vllm/multimodal/parse.py | 11 +- vllm/multimodal/processing.py | 38 ++-- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/utils.py | 25 ++- vllm/sequence.py | 6 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/serial_utils.py | 41 ++++- vllm/v1/worker/gpu_input_batch.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/tpu_model_runner.py | 5 +- 77 files changed, 431 insertions(+), 383 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 327472df1d..57142e8f56 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -77,6 +77,7 @@ Internal data structures. - [vllm.multimodal.inputs.MultiModalFieldElem][] - [vllm.multimodal.inputs.MultiModalFieldConfig][] - [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargsItems][] - [vllm.multimodal.inputs.MultiModalKwargs][] - [vllm.multimodal.inputs.MultiModalInputs][] diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 64a48be326..76d0f067fd 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 906966ddd0..a1744317b3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -370,10 +370,16 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"} + b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"} + + assert a_rest == b_rest, msg + + a_data = a["mm_kwargs"].get_data() + b_data = b["mm_kwargs"].get_data() for key in ignore_mm_keys: - a["mm_kwargs"].pop(key, None) - b["mm_kwargs"].pop(key, None) + a_data.pop(key, None) + b_data.pop(key, None) - assert a == b, msg + assert a_data == b_data, msg diff --git 
a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index a6d900ec5d..a49842e109 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -45,7 +45,8 @@ def test_processor_override( video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) video_tok_count = processed_inputs["prompt_token_ids"].count( video_token_id) - grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + grid_t, _, _ = processed_inputs["mm_kwargs"].get_data( + )["video_grid_thw"][0] assert grid_t == expected_grid_t assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 76e4acc67d..1adfe21352 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -108,7 +108,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index c3e2841a8f..e4f25f5ac7 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -68,7 +68,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 5e14f0f996..bea4f43567 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -51,14 +51,14 @@ def test_processor_override( prompt = encode_tokens(tokenizer, prompt) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - mm_kwargs = processed_inputs["mm_kwargs"] + mm_data = processed_inputs["mm_kwargs"].get_data() # place holder replacements prompt_token_ids = processed_inputs["prompt_token_ids"] assert prompt_token_ids.count(config.boi_token_index) == num_imgs assert prompt_token_ids.count(config.eoi_token_index) == num_imgs assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs - aspect_ratios = mm_kwargs["aspect_ratios"] + aspect_ratios = mm_data["aspect_ratios"] num_x_separators = num_y_separators = 0 for tiles_y, tiles_x in aspect_ratios: if tiles_x * tiles_y > 1: @@ -80,6 +80,6 @@ def test_processor_override( num_patches_per_chunk = processor.info.get_patch_per_chunk( config.vision_config) assert prompt_token_ids.count(config.image_token_index) \ - == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk - assert mm_kwargs["pixel_values"].shape[0] \ - == 
mm_kwargs["patches_per_image"].sum() + == sum(mm_data["patches_per_image"]) * num_patches_per_chunk + assert len(mm_data["pixel_values"]) \ + == sum(mm_data["patches_per_image"]) diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index a6b20a1e36..b42d3f89f3 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,18 +49,18 @@ def test_profiling( encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) ] * max_num_seqs - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() # Get the actual number of encoder tokens for each sample. # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the # block manager to allocate blocks for those images only. # See MllamaMultiModalProcessor for more details. - num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")] + num_tiles = [[t] for t in mm_data.pop("num_tiles")] num_tokens_per_tile = calc_token_per_chunk(image_size) actual_encoder_seq_lens = [ sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index f3871b60c3..3be77b5da6 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int): hf_config = ctx.get_hf_config(Llama4Config) - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() image_size = hf_config.vision_config.image_size patch_size = hf_config.vision_config.patch_size downsample_ratio = int( round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))) tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio - chunks_per_image = prod(mm_kwargs["patches_per_image"]) + chunks_per_image = prod(mm_data["patches_per_image"]) total_num_patches = chunks_per_image * tokens_per_patch - num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][ - 0][1] # x-y seperator tokens + num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][ + 1] # x-y seperator tokens total_tokens = total_num_patches.item() + num_tiles.item( ) + 3 # image start, image, image end diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 6fbbab0d26..d9f1965a05 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -70,7 +70,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape) assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py 
b/tests/models/multimodal/processing/test_qwen2_vl.py index 9d1cd18338..985f4188fd 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -48,7 +48,8 @@ def test_processor_override( hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values"].shape assert img_tok_count == expected_toks_per_img * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 036624431c..51e5b84b6c 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -128,7 +128,7 @@ def create_batched_mm_kwargs( )["mm_kwargs"] items = [ item for modality in supported_mm_limits - for item in mm_kwargs.get_items(modality) + for item in mm_kwargs[modality] ] return group_mm_kwargs_by_modality(items) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 2149f05b6a..088cd00db2 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -4,8 +4,8 @@ import pytest import torch from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField) @@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): ]) -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs([ +def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargsItems.from_seq([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) @@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): [ (_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100, "a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501 ], ) # yapf: enable diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 586276ee08..118b40d0ef 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -11,7 +11,8 @@ import torch from vllm.multimodal.inputs import (MultiModalBatchedField, MultiModalFieldElem, MultiModalFlatField, - MultiModalKwargs, MultiModalKwargsItem, + MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField, NestedTensors) from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch): class MyRequest(msgspec.Struct): - mm: Optional[list[MultiModalKwargs]] + mm: Optional[list[MultiModalKwargsItems]] def test_multimodal_kwargs(): @@ -119,7 +120,7 @@ def test_multimodal_kwargs(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, 
e4]) - mm = MultiModalKwargs([audio, video, image]) + mm = MultiModalKwargsItems.from_seq([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) @@ -133,19 +134,22 @@ def test_multimodal_kwargs(): total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - # expected total encoding length, should be 14255, +-20 for minor changes - assert 14250 <= total_len <= 14300 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] + # expected total encoding length, should be 14306, +-20 for minor changes + assert 14275 <= total_len <= 14325 + decoded = decoder.decode(encoded).mm[0] + assert isinstance(decoded, MultiModalKwargsItems) # check all modalities were recovered and do some basic sanity checks - assert len(decoded.modalities) == 3 - images = decoded.get_items("image") + assert len(decoded) == 3 + images = decoded["image"] assert len(images) == 1 assert len(images[0].items()) == 2 assert list(images[0].keys()) == ["i0", "i1"] # check the tensor contents and layout in the main dict - assert all(nested_equal(mm[k], decoded[k]) for k in mm) + mm_data = mm.get_data() + decoded_data = decoded.get_data() + assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data) def nested_equal(a: NestedTensors, b: NestedTensors): diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 852c8f5cff..4ce6d8dfad 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -4,11 +4,12 @@ from array import array from typing import Any, Type +from vllm.multimodal.inputs import MultiModalKwargs from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE def encode_hook(obj: Any) -> Any: - """Custom msgspec enc hook that supports array types. + """Custom msgspec enc hook that supports array types and MultiModalKwargs. See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any: f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " f"Given array has a type code of {obj.typecode}.") return obj.tobytes() + if isinstance(obj, MultiModalKwargs): + return dict(obj) def decode_hook(type: Type, obj: Any) -> Any: - """Custom msgspec dec hook that supports array types. + """Custom msgspec dec hook that supports array types and MultiModalKwargs. 
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized + if type is MultiModalKwargs: + return MultiModalKwargs(obj) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index e1368a3f64..1c7960fa3e 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -470,7 +470,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 5cd74bbba4..b02a973d94 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -18,7 +18,7 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -242,7 +242,7 @@ class AyaVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8e3505f872..2f2b880bb0 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -492,7 +492,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() diff --git 
a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8d705f40ce..e6914ad4c4 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -151,7 +151,7 @@ class ChameleonMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index f17583768f..bc526fd661 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -241,7 +241,7 @@ class Cohere2VisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0acca75d9..e881e9c6dd 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, 
Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 56e456c2f1..4a8cb35a54 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseProcessingInfo, EncDecMultiModalProcessor, @@ -860,7 +860,7 @@ class Florence2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() pad_token_id = hf_config.pad_token_id diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b61e0361fe..90af859ab9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 9871b11b37..bf5ad633b9 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) # yapf: disable @@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.boi_token diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index a0c3bb5007..79061fd30c 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ 
-24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalDataParser) # yapf: disable @@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 88c53c8363..015577322f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, VideoItem) + MultiModalKwargsItems, VideoItem) from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( @@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): merge_length = image_processor.merge_size**2 def get_image_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length return [hf_processor.image_token_id] * num_tokens def get_video_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1751fccd08..bf33575859 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, 
object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index c9e3b74e7c..c3ac3bb78c 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> list[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index c3e4f81597..9ab3f4d0d9 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, @@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index e5c94c7f3a..d3ddc47ea9 100644 --- 
a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache, @@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() placeholder = { @@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor( def get_replacement_hyperclovax( item_idx: int, modality: str, - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ): - num_tokens = None + out_item = out_mm_kwargs[modality][item_idx] + if modality == "image": + lens = out_item["vision_query_lengths_images"].data num_tokens = self.info.get_num_image_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_images"][item_idx], ) - if modality == "video": + vision_query_length=lens) + elif modality == "video": + lens = out_item["vision_query_lengths_videos"].data num_tokens = self.info.get_num_video_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_videos"][item_idx], ) - assert isinstance(num_tokens, int) - return [ - placeholder[modality], - ] * num_tokens + vision_query_length=lens) + else: + raise NotImplementedError(modality) + + return [placeholder[modality]] * num_tokens return [ PromptReplacement( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3c01789b90..63307470d9 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageProcessorItems, ImageSize # yapf conflicts with isort for this block # yapf: disable @@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token, _, _ = self.info._get_image_token(hf_processor) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d952ced2fa..c739e74b05 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, 
ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -399,7 +399,7 @@ class InternS1MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) img_context_token = hf_processor.image_token @@ -407,15 +407,16 @@ class InternS1MultiModalProcessor( end_image_token = hf_processor.end_image_token video_token = hf_processor.video_token - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: video_num_patches = [] - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() else: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8e766dd4c4..da8ad83967 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -966,15 +967,19 @@ class InternVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: - prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( - mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + prompt_repl = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) 
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: @@ -992,12 +997,15 @@ class InternVLMultiModalProcessor( video_context_token=hf_processor.video_token) if self.info.supports_video: - prompt_repl.append( + prompt_repl = [ + *prompt_repl, PromptReplacement( modality="video", target="